1 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the interfaces that ARM uses to lower LLVM code into a 10 // selection DAG. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "ARMISelLowering.h" 15 #include "ARMBaseInstrInfo.h" 16 #include "ARMBaseRegisterInfo.h" 17 #include "ARMCallingConv.h" 18 #include "ARMConstantPoolValue.h" 19 #include "ARMMachineFunctionInfo.h" 20 #include "ARMPerfectShuffle.h" 21 #include "ARMRegisterInfo.h" 22 #include "ARMSelectionDAGInfo.h" 23 #include "ARMSubtarget.h" 24 #include "ARMTargetTransformInfo.h" 25 #include "MCTargetDesc/ARMAddressingModes.h" 26 #include "MCTargetDesc/ARMBaseInfo.h" 27 #include "Utils/ARMBaseInfo.h" 28 #include "llvm/ADT/APFloat.h" 29 #include "llvm/ADT/APInt.h" 30 #include "llvm/ADT/ArrayRef.h" 31 #include "llvm/ADT/BitVector.h" 32 #include "llvm/ADT/DenseMap.h" 33 #include "llvm/ADT/STLExtras.h" 34 #include "llvm/ADT/SmallPtrSet.h" 35 #include "llvm/ADT/SmallVector.h" 36 #include "llvm/ADT/Statistic.h" 37 #include "llvm/ADT/StringExtras.h" 38 #include "llvm/ADT/StringRef.h" 39 #include "llvm/ADT/StringSwitch.h" 40 #include "llvm/ADT/Triple.h" 41 #include "llvm/ADT/Twine.h" 42 #include "llvm/Analysis/VectorUtils.h" 43 #include "llvm/CodeGen/CallingConvLower.h" 44 #include "llvm/CodeGen/ISDOpcodes.h" 45 #include "llvm/CodeGen/IntrinsicLowering.h" 46 #include "llvm/CodeGen/MachineBasicBlock.h" 47 #include "llvm/CodeGen/MachineConstantPool.h" 48 #include "llvm/CodeGen/MachineFrameInfo.h" 49 #include "llvm/CodeGen/MachineFunction.h" 50 #include "llvm/CodeGen/MachineInstr.h" 51 #include "llvm/CodeGen/MachineInstrBuilder.h" 52 #include "llvm/CodeGen/MachineJumpTableInfo.h" 53 #include "llvm/CodeGen/MachineMemOperand.h" 54 #include "llvm/CodeGen/MachineOperand.h" 55 #include "llvm/CodeGen/MachineRegisterInfo.h" 56 #include "llvm/CodeGen/RuntimeLibcalls.h" 57 #include "llvm/CodeGen/SelectionDAG.h" 58 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" 59 #include "llvm/CodeGen/SelectionDAGNodes.h" 60 #include "llvm/CodeGen/TargetInstrInfo.h" 61 #include "llvm/CodeGen/TargetLowering.h" 62 #include "llvm/CodeGen/TargetOpcodes.h" 63 #include "llvm/CodeGen/TargetRegisterInfo.h" 64 #include "llvm/CodeGen/TargetSubtargetInfo.h" 65 #include "llvm/CodeGen/ValueTypes.h" 66 #include "llvm/IR/Attributes.h" 67 #include "llvm/IR/CallingConv.h" 68 #include "llvm/IR/Constant.h" 69 #include "llvm/IR/Constants.h" 70 #include "llvm/IR/DataLayout.h" 71 #include "llvm/IR/DebugLoc.h" 72 #include "llvm/IR/DerivedTypes.h" 73 #include "llvm/IR/Function.h" 74 #include "llvm/IR/GlobalAlias.h" 75 #include "llvm/IR/GlobalValue.h" 76 #include "llvm/IR/GlobalVariable.h" 77 #include "llvm/IR/IRBuilder.h" 78 #include "llvm/IR/InlineAsm.h" 79 #include "llvm/IR/Instruction.h" 80 #include "llvm/IR/Instructions.h" 81 #include "llvm/IR/IntrinsicInst.h" 82 #include "llvm/IR/Intrinsics.h" 83 #include "llvm/IR/IntrinsicsARM.h" 84 #include "llvm/IR/Module.h" 85 #include "llvm/IR/PatternMatch.h" 86 #include "llvm/IR/Type.h" 87 #include "llvm/IR/User.h" 88 #include "llvm/IR/Value.h" 89 #include "llvm/MC/MCInstrDesc.h" 90 #include "llvm/MC/MCInstrItineraries.h" 91 #include 
"llvm/MC/MCRegisterInfo.h" 92 #include "llvm/MC/MCSchedule.h" 93 #include "llvm/Support/AtomicOrdering.h" 94 #include "llvm/Support/BranchProbability.h" 95 #include "llvm/Support/Casting.h" 96 #include "llvm/Support/CodeGen.h" 97 #include "llvm/Support/CommandLine.h" 98 #include "llvm/Support/Compiler.h" 99 #include "llvm/Support/Debug.h" 100 #include "llvm/Support/ErrorHandling.h" 101 #include "llvm/Support/KnownBits.h" 102 #include "llvm/Support/MachineValueType.h" 103 #include "llvm/Support/MathExtras.h" 104 #include "llvm/Support/raw_ostream.h" 105 #include "llvm/Target/TargetMachine.h" 106 #include "llvm/Target/TargetOptions.h" 107 #include <algorithm> 108 #include <cassert> 109 #include <cstdint> 110 #include <cstdlib> 111 #include <iterator> 112 #include <limits> 113 #include <string> 114 #include <tuple> 115 #include <utility> 116 #include <vector> 117 118 using namespace llvm; 119 using namespace llvm::PatternMatch; 120 121 #define DEBUG_TYPE "arm-isel" 122 123 STATISTIC(NumTailCalls, "Number of tail calls"); 124 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 125 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 126 STATISTIC(NumConstpoolPromoted, 127 "Number of constants with their storage promoted into constant pools"); 128 129 static cl::opt<bool> 130 ARMInterworking("arm-interworking", cl::Hidden, 131 cl::desc("Enable / disable ARM interworking (for debugging only)"), 132 cl::init(true)); 133 134 static cl::opt<bool> EnableConstpoolPromotion( 135 "arm-promote-constant", cl::Hidden, 136 cl::desc("Enable / disable promotion of unnamed_addr constants into " 137 "constant pools"), 138 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed 139 static cl::opt<unsigned> ConstpoolPromotionMaxSize( 140 "arm-promote-constant-max-size", cl::Hidden, 141 cl::desc("Maximum size of constant to promote into a constant pool"), 142 cl::init(64)); 143 static cl::opt<unsigned> ConstpoolPromotionMaxTotal( 144 "arm-promote-constant-max-total", cl::Hidden, 145 cl::desc("Maximum size of ALL constants to promote into a constant pool"), 146 cl::init(128)); 147 148 cl::opt<unsigned> 149 MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, 150 cl::desc("Maximum interleave factor for MVE VLDn to generate."), 151 cl::init(2)); 152 153 // The APCS parameter registers. 
154 static const MCPhysReg GPRArgRegs[] = { 155 ARM::R0, ARM::R1, ARM::R2, ARM::R3 156 }; 157 158 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) { 159 if (VT != PromotedLdStVT) { 160 setOperationAction(ISD::LOAD, VT, Promote); 161 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); 162 163 setOperationAction(ISD::STORE, VT, Promote); 164 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); 165 } 166 167 MVT ElemTy = VT.getVectorElementType(); 168 if (ElemTy != MVT::f64) 169 setOperationAction(ISD::SETCC, VT, Custom); 170 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 171 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 172 if (ElemTy == MVT::i32) { 173 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 174 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 175 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 176 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 177 } else { 178 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 179 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 180 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 181 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 182 } 183 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 184 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 185 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 186 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 187 setOperationAction(ISD::SELECT, VT, Expand); 188 setOperationAction(ISD::SELECT_CC, VT, Expand); 189 setOperationAction(ISD::VSELECT, VT, Expand); 190 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 191 if (VT.isInteger()) { 192 setOperationAction(ISD::SHL, VT, Custom); 193 setOperationAction(ISD::SRA, VT, Custom); 194 setOperationAction(ISD::SRL, VT, Custom); 195 } 196 197 // Neon does not support vector divide/remainder operations. 198 setOperationAction(ISD::SDIV, VT, Expand); 199 setOperationAction(ISD::UDIV, VT, Expand); 200 setOperationAction(ISD::FDIV, VT, Expand); 201 setOperationAction(ISD::SREM, VT, Expand); 202 setOperationAction(ISD::UREM, VT, Expand); 203 setOperationAction(ISD::FREM, VT, Expand); 204 setOperationAction(ISD::SDIVREM, VT, Expand); 205 setOperationAction(ISD::UDIVREM, VT, Expand); 206 207 if (!VT.isFloatingPoint() && 208 VT != MVT::v2i64 && VT != MVT::v1i64) 209 for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 210 setOperationAction(Opcode, VT, Legal); 211 if (!VT.isFloatingPoint()) 212 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}) 213 setOperationAction(Opcode, VT, Legal); 214 } 215 216 void ARMTargetLowering::addDRTypeForNEON(MVT VT) { 217 addRegisterClass(VT, &ARM::DPRRegClass); 218 addTypeForNEON(VT, MVT::f64); 219 } 220 221 void ARMTargetLowering::addQRTypeForNEON(MVT VT) { 222 addRegisterClass(VT, &ARM::DPairRegClass); 223 addTypeForNEON(VT, MVT::v2f64); 224 } 225 226 void ARMTargetLowering::setAllExpand(MVT VT) { 227 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc) 228 setOperationAction(Opc, VT, Expand); 229 230 // We support these really simple operations even on types where all 231 // the actual arithmetic has to be broken down into simpler 232 // operations or turned into library calls. 
233 setOperationAction(ISD::BITCAST, VT, Legal); 234 setOperationAction(ISD::LOAD, VT, Legal); 235 setOperationAction(ISD::STORE, VT, Legal); 236 setOperationAction(ISD::UNDEF, VT, Legal); 237 } 238 239 void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To, 240 LegalizeAction Action) { 241 setLoadExtAction(ISD::EXTLOAD, From, To, Action); 242 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action); 243 setLoadExtAction(ISD::SEXTLOAD, From, To, Action); 244 } 245 246 void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { 247 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 }; 248 249 for (auto VT : IntTypes) { 250 addRegisterClass(VT, &ARM::MQPRRegClass); 251 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 252 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 253 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 254 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 255 setOperationAction(ISD::SHL, VT, Custom); 256 setOperationAction(ISD::SRA, VT, Custom); 257 setOperationAction(ISD::SRL, VT, Custom); 258 setOperationAction(ISD::SMIN, VT, Legal); 259 setOperationAction(ISD::SMAX, VT, Legal); 260 setOperationAction(ISD::UMIN, VT, Legal); 261 setOperationAction(ISD::UMAX, VT, Legal); 262 setOperationAction(ISD::ABS, VT, Legal); 263 setOperationAction(ISD::SETCC, VT, Custom); 264 setOperationAction(ISD::MLOAD, VT, Custom); 265 setOperationAction(ISD::MSTORE, VT, Legal); 266 setOperationAction(ISD::CTLZ, VT, Legal); 267 setOperationAction(ISD::CTTZ, VT, Custom); 268 setOperationAction(ISD::BITREVERSE, VT, Legal); 269 setOperationAction(ISD::BSWAP, VT, Legal); 270 setOperationAction(ISD::SADDSAT, VT, Legal); 271 setOperationAction(ISD::UADDSAT, VT, Legal); 272 setOperationAction(ISD::SSUBSAT, VT, Legal); 273 setOperationAction(ISD::USUBSAT, VT, Legal); 274 setOperationAction(ISD::ABDS, VT, Legal); 275 setOperationAction(ISD::ABDU, VT, Legal); 276 setOperationAction(ISD::AVGFLOORS, VT, Legal); 277 setOperationAction(ISD::AVGFLOORU, VT, Legal); 278 setOperationAction(ISD::AVGCEILS, VT, Legal); 279 setOperationAction(ISD::AVGCEILU, VT, Legal); 280 281 // No native support for these. 
282 setOperationAction(ISD::UDIV, VT, Expand); 283 setOperationAction(ISD::SDIV, VT, Expand); 284 setOperationAction(ISD::UREM, VT, Expand); 285 setOperationAction(ISD::SREM, VT, Expand); 286 setOperationAction(ISD::UDIVREM, VT, Expand); 287 setOperationAction(ISD::SDIVREM, VT, Expand); 288 setOperationAction(ISD::CTPOP, VT, Expand); 289 setOperationAction(ISD::SELECT, VT, Expand); 290 setOperationAction(ISD::SELECT_CC, VT, Expand); 291 292 // Vector reductions 293 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); 294 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal); 295 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); 296 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); 297 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); 298 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom); 299 setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 300 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 301 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 302 303 if (!HasMVEFP) { 304 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 305 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 306 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 307 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 308 } else { 309 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); 310 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); 311 } 312 313 // Pre and Post inc are supported on loads and stores 314 for (unsigned im = (unsigned)ISD::PRE_INC; 315 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 316 setIndexedLoadAction(im, VT, Legal); 317 setIndexedStoreAction(im, VT, Legal); 318 setIndexedMaskedLoadAction(im, VT, Legal); 319 setIndexedMaskedStoreAction(im, VT, Legal); 320 } 321 } 322 323 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 }; 324 for (auto VT : FloatTypes) { 325 addRegisterClass(VT, &ARM::MQPRRegClass); 326 if (!HasMVEFP) 327 setAllExpand(VT); 328 329 // These are legal or custom whether we have MVE.fp or not 330 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 331 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 332 setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom); 333 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 334 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 335 setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom); 336 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); 337 setOperationAction(ISD::SETCC, VT, Custom); 338 setOperationAction(ISD::MLOAD, VT, Custom); 339 setOperationAction(ISD::MSTORE, VT, Legal); 340 setOperationAction(ISD::SELECT, VT, Expand); 341 setOperationAction(ISD::SELECT_CC, VT, Expand); 342 343 // Pre and Post inc are supported on loads and stores 344 for (unsigned im = (unsigned)ISD::PRE_INC; 345 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 346 setIndexedLoadAction(im, VT, Legal); 347 setIndexedStoreAction(im, VT, Legal); 348 setIndexedMaskedLoadAction(im, VT, Legal); 349 setIndexedMaskedStoreAction(im, VT, Legal); 350 } 351 352 if (HasMVEFP) { 353 setOperationAction(ISD::FMINNUM, VT, Legal); 354 setOperationAction(ISD::FMAXNUM, VT, Legal); 355 setOperationAction(ISD::FROUND, VT, Legal); 356 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 357 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); 358 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); 359 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); 360 361 // No native support for these. 
362 setOperationAction(ISD::FDIV, VT, Expand); 363 setOperationAction(ISD::FREM, VT, Expand); 364 setOperationAction(ISD::FSQRT, VT, Expand); 365 setOperationAction(ISD::FSIN, VT, Expand); 366 setOperationAction(ISD::FCOS, VT, Expand); 367 setOperationAction(ISD::FPOW, VT, Expand); 368 setOperationAction(ISD::FLOG, VT, Expand); 369 setOperationAction(ISD::FLOG2, VT, Expand); 370 setOperationAction(ISD::FLOG10, VT, Expand); 371 setOperationAction(ISD::FEXP, VT, Expand); 372 setOperationAction(ISD::FEXP2, VT, Expand); 373 setOperationAction(ISD::FNEARBYINT, VT, Expand); 374 } 375 } 376 377 // Custom Expand smaller than legal vector reductions to prevent false zero 378 // items being added. 379 setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom); 380 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom); 381 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom); 382 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom); 383 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom); 384 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom); 385 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom); 386 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom); 387 388 // We 'support' these types up to bitcast/load/store level, regardless of 389 // MVE integer-only / float support. Only doing FP data processing on the FP 390 // vector types is inhibited at integer-only level. 391 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 }; 392 for (auto VT : LongTypes) { 393 addRegisterClass(VT, &ARM::MQPRRegClass); 394 setAllExpand(VT); 395 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 396 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 397 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 398 setOperationAction(ISD::VSELECT, VT, Legal); 399 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 400 } 401 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); 402 403 // We can do bitwise operations on v2i64 vectors 404 setOperationAction(ISD::AND, MVT::v2i64, Legal); 405 setOperationAction(ISD::OR, MVT::v2i64, Legal); 406 setOperationAction(ISD::XOR, MVT::v2i64, Legal); 407 408 // It is legal to extload from v4i8 to v4i16 or v4i32. 409 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal); 410 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal); 411 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal); 412 413 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16. 414 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal); 415 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal); 416 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal); 417 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal); 418 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal); 419 420 // Some truncating stores are legal too. 
421 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); 422 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); 423 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); 424 425 // Pre and Post inc on these are legal, given the correct extends 426 for (unsigned im = (unsigned)ISD::PRE_INC; 427 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 428 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) { 429 setIndexedLoadAction(im, VT, Legal); 430 setIndexedStoreAction(im, VT, Legal); 431 setIndexedMaskedLoadAction(im, VT, Legal); 432 setIndexedMaskedStoreAction(im, VT, Legal); 433 } 434 } 435 436 // Predicate types 437 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1}; 438 for (auto VT : pTypes) { 439 addRegisterClass(VT, &ARM::VCCRRegClass); 440 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 441 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 442 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 443 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 444 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 445 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 446 setOperationAction(ISD::SETCC, VT, Custom); 447 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); 448 setOperationAction(ISD::LOAD, VT, Custom); 449 setOperationAction(ISD::STORE, VT, Custom); 450 setOperationAction(ISD::TRUNCATE, VT, Custom); 451 setOperationAction(ISD::VSELECT, VT, Expand); 452 setOperationAction(ISD::SELECT, VT, Expand); 453 setOperationAction(ISD::SELECT_CC, VT, Expand); 454 455 if (!HasMVEFP) { 456 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 457 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 458 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 459 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 460 } 461 } 462 setOperationAction(ISD::SETCC, MVT::v2i1, Expand); 463 setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand); 464 setOperationAction(ISD::AND, MVT::v2i1, Expand); 465 setOperationAction(ISD::OR, MVT::v2i1, Expand); 466 setOperationAction(ISD::XOR, MVT::v2i1, Expand); 467 setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand); 468 setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand); 469 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Expand); 470 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand); 471 472 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); 473 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); 474 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); 475 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); 476 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); 477 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); 478 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); 479 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); 480 } 481 482 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, 483 const ARMSubtarget &STI) 484 : TargetLowering(TM), Subtarget(&STI) { 485 RegInfo = Subtarget->getRegisterInfo(); 486 Itins = Subtarget->getInstrItineraryData(); 487 488 setBooleanContents(ZeroOrOneBooleanContent); 489 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 490 491 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && 492 !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) { 493 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; 494 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) 495 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID), 496 IsHFTarget ? 
CallingConv::ARM_AAPCS_VFP 497 : CallingConv::ARM_AAPCS); 498 } 499 500 if (Subtarget->isTargetMachO()) { 501 // Uses VFP for Thumb libfuncs if available. 502 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && 503 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { 504 static const struct { 505 const RTLIB::Libcall Op; 506 const char * const Name; 507 const ISD::CondCode Cond; 508 } LibraryCalls[] = { 509 // Single-precision floating-point arithmetic. 510 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, 511 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, 512 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, 513 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, 514 515 // Double-precision floating-point arithmetic. 516 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, 517 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, 518 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, 519 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, 520 521 // Single-precision comparisons. 522 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, 523 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, 524 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, 525 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, 526 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, 527 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, 528 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, 529 530 // Double-precision comparisons. 531 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, 532 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, 533 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, 534 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, 535 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, 536 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, 537 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, 538 539 // Floating-point to integer conversions. 540 // i64 conversions are done via library routines even when generating VFP 541 // instructions, so use the same ones. 542 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, 543 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, 544 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, 545 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, 546 547 // Conversions between floating types. 548 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, 549 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, 550 551 // Integer to floating-point conversions. 552 // i64 conversions are done via library routines even when generating VFP 553 // instructions, so use the same ones. 554 // FIXME: There appears to be some naming inconsistency in ARM libgcc: 555 // e.g., __floatunsidf vs. __floatunssidfvfp. 556 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, 557 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, 558 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, 559 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, 560 }; 561 562 for (const auto &LC : LibraryCalls) { 563 setLibcallName(LC.Op, LC.Name); 564 if (LC.Cond != ISD::SETCC_INVALID) 565 setCmpLibcallCC(LC.Op, LC.Cond); 566 } 567 } 568 } 569 570 // These libcalls are not available in 32-bit. 
571 setLibcallName(RTLIB::SHL_I128, nullptr); 572 setLibcallName(RTLIB::SRL_I128, nullptr); 573 setLibcallName(RTLIB::SRA_I128, nullptr); 574 setLibcallName(RTLIB::MUL_I128, nullptr); 575 setLibcallName(RTLIB::MULO_I64, nullptr); 576 setLibcallName(RTLIB::MULO_I128, nullptr); 577 578 // RTLIB 579 if (Subtarget->isAAPCS_ABI() && 580 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || 581 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { 582 static const struct { 583 const RTLIB::Libcall Op; 584 const char * const Name; 585 const CallingConv::ID CC; 586 const ISD::CondCode Cond; 587 } LibraryCalls[] = { 588 // Double-precision floating-point arithmetic helper functions 589 // RTABI chapter 4.1.2, Table 2 590 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 591 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 592 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 593 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 594 595 // Double-precision floating-point comparison helper functions 596 // RTABI chapter 4.1.2, Table 3 597 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 598 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 599 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 600 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 601 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 602 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 603 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 604 605 // Single-precision floating-point arithmetic helper functions 606 // RTABI chapter 4.1.2, Table 4 607 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 608 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 609 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 610 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 611 612 // Single-precision floating-point comparison helper functions 613 // RTABI chapter 4.1.2, Table 5 614 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 615 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 616 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 617 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 618 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 619 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 620 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 621 622 // Floating-point to integer conversions. 
623 // RTABI chapter 4.1.2, Table 6 624 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 625 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 626 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 627 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 628 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 629 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 630 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 631 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 632 633 // Conversions between floating types. 634 // RTABI chapter 4.1.2, Table 7 635 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 636 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 637 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 638 639 // Integer to floating-point conversions. 640 // RTABI chapter 4.1.2, Table 8 641 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 642 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 643 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 644 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 645 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 646 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 647 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 648 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 649 650 // Long long helper functions 651 // RTABI chapter 4.2, Table 9 652 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 653 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 654 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 655 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 656 657 // Integer division functions 658 // RTABI chapter 4.3.1 659 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 660 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 661 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 662 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 663 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 664 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 665 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 666 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 667 }; 668 669 for (const auto &LC : LibraryCalls) { 670 setLibcallName(LC.Op, LC.Name); 671 setLibcallCallingConv(LC.Op, LC.CC); 672 if (LC.Cond != ISD::SETCC_INVALID) 673 setCmpLibcallCC(LC.Op, LC.Cond); 674 } 675 676 // EABI dependent RTLIB 677 if (TM.Options.EABIVersion == EABI::EABI4 || 678 TM.Options.EABIVersion == EABI::EABI5) { 679 static const struct { 680 const RTLIB::Libcall Op; 681 const char *const Name; 682 
const CallingConv::ID CC; 683 const ISD::CondCode Cond; 684 } MemOpsLibraryCalls[] = { 685 // Memory operations 686 // RTABI chapter 4.3.4 687 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 688 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 689 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 690 }; 691 692 for (const auto &LC : MemOpsLibraryCalls) { 693 setLibcallName(LC.Op, LC.Name); 694 setLibcallCallingConv(LC.Op, LC.CC); 695 if (LC.Cond != ISD::SETCC_INVALID) 696 setCmpLibcallCC(LC.Op, LC.Cond); 697 } 698 } 699 } 700 701 if (Subtarget->isTargetWindows()) { 702 static const struct { 703 const RTLIB::Libcall Op; 704 const char * const Name; 705 const CallingConv::ID CC; 706 } LibraryCalls[] = { 707 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, 708 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, 709 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, 710 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, 711 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, 712 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, 713 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, 714 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, 715 }; 716 717 for (const auto &LC : LibraryCalls) { 718 setLibcallName(LC.Op, LC.Name); 719 setLibcallCallingConv(LC.Op, LC.CC); 720 } 721 } 722 723 // Use divmod compiler-rt calls for iOS 5.0 and later. 724 if (Subtarget->isTargetMachO() && 725 !(Subtarget->isTargetIOS() && 726 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { 727 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); 728 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); 729 } 730 731 // The half <-> float conversion functions are always soft-float on 732 // non-watchos platforms, but are needed for some targets which use a 733 // hard-float calling convention by default. 734 if (!Subtarget->isTargetWatchABI()) { 735 if (Subtarget->isAAPCS_ABI()) { 736 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); 737 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); 738 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); 739 } else { 740 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); 741 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); 742 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); 743 } 744 } 745 746 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have 747 // a __gnu_ prefix (which is the default). 
748 if (Subtarget->isTargetAEABI()) { 749 static const struct { 750 const RTLIB::Libcall Op; 751 const char * const Name; 752 const CallingConv::ID CC; 753 } LibraryCalls[] = { 754 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, 755 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, 756 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, 757 }; 758 759 for (const auto &LC : LibraryCalls) { 760 setLibcallName(LC.Op, LC.Name); 761 setLibcallCallingConv(LC.Op, LC.CC); 762 } 763 } 764 765 if (Subtarget->isThumb1Only()) 766 addRegisterClass(MVT::i32, &ARM::tGPRRegClass); 767 else 768 addRegisterClass(MVT::i32, &ARM::GPRRegClass); 769 770 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() && 771 Subtarget->hasFPRegs()) { 772 addRegisterClass(MVT::f32, &ARM::SPRRegClass); 773 addRegisterClass(MVT::f64, &ARM::DPRRegClass); 774 775 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom); 776 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom); 777 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); 778 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); 779 780 if (!Subtarget->hasVFP2Base()) 781 setAllExpand(MVT::f32); 782 if (!Subtarget->hasFP64()) 783 setAllExpand(MVT::f64); 784 } 785 786 if (Subtarget->hasFullFP16()) { 787 addRegisterClass(MVT::f16, &ARM::HPRRegClass); 788 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 789 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 790 791 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 792 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 793 } 794 795 if (Subtarget->hasBF16()) { 796 addRegisterClass(MVT::bf16, &ARM::HPRRegClass); 797 setAllExpand(MVT::bf16); 798 if (!Subtarget->hasFullFP16()) 799 setOperationAction(ISD::BITCAST, MVT::bf16, Custom); 800 } 801 802 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 803 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { 804 setTruncStoreAction(VT, InnerVT, Expand); 805 addAllExtLoads(VT, InnerVT, Expand); 806 } 807 808 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 809 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 810 811 setOperationAction(ISD::BSWAP, VT, Expand); 812 } 813 814 setOperationAction(ISD::ConstantFP, MVT::f32, Custom); 815 setOperationAction(ISD::ConstantFP, MVT::f64, Custom); 816 817 setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); 818 setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); 819 820 if (Subtarget->hasMVEIntegerOps()) 821 addMVEVectorTypes(Subtarget->hasMVEFloatOps()); 822 823 // Combine low-overhead loop intrinsics so that we can lower i1 types. 
824 if (Subtarget->hasLOB()) { 825 setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC}); 826 } 827 828 if (Subtarget->hasNEON()) { 829 addDRTypeForNEON(MVT::v2f32); 830 addDRTypeForNEON(MVT::v8i8); 831 addDRTypeForNEON(MVT::v4i16); 832 addDRTypeForNEON(MVT::v2i32); 833 addDRTypeForNEON(MVT::v1i64); 834 835 addQRTypeForNEON(MVT::v4f32); 836 addQRTypeForNEON(MVT::v2f64); 837 addQRTypeForNEON(MVT::v16i8); 838 addQRTypeForNEON(MVT::v8i16); 839 addQRTypeForNEON(MVT::v4i32); 840 addQRTypeForNEON(MVT::v2i64); 841 842 if (Subtarget->hasFullFP16()) { 843 addQRTypeForNEON(MVT::v8f16); 844 addDRTypeForNEON(MVT::v4f16); 845 } 846 847 if (Subtarget->hasBF16()) { 848 addQRTypeForNEON(MVT::v8bf16); 849 addDRTypeForNEON(MVT::v4bf16); 850 } 851 } 852 853 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { 854 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but 855 // none of Neon, MVE or VFP supports any arithmetic operations on it. 856 setOperationAction(ISD::FADD, MVT::v2f64, Expand); 857 setOperationAction(ISD::FSUB, MVT::v2f64, Expand); 858 setOperationAction(ISD::FMUL, MVT::v2f64, Expand); 859 // FIXME: Code duplication: FDIV and FREM are expanded always, see 860 // ARMTargetLowering::addTypeForNEON method for details. 861 setOperationAction(ISD::FDIV, MVT::v2f64, Expand); 862 setOperationAction(ISD::FREM, MVT::v2f64, Expand); 863 // FIXME: Create unittest. 864 // In another words, find a way when "copysign" appears in DAG with vector 865 // operands. 866 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); 867 // FIXME: Code duplication: SETCC has custom operation action, see 868 // ARMTargetLowering::addTypeForNEON method for details. 869 setOperationAction(ISD::SETCC, MVT::v2f64, Expand); 870 // FIXME: Create unittest for FNEG and for FABS. 871 setOperationAction(ISD::FNEG, MVT::v2f64, Expand); 872 setOperationAction(ISD::FABS, MVT::v2f64, Expand); 873 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); 874 setOperationAction(ISD::FSIN, MVT::v2f64, Expand); 875 setOperationAction(ISD::FCOS, MVT::v2f64, Expand); 876 setOperationAction(ISD::FPOW, MVT::v2f64, Expand); 877 setOperationAction(ISD::FLOG, MVT::v2f64, Expand); 878 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); 879 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); 880 setOperationAction(ISD::FEXP, MVT::v2f64, Expand); 881 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); 882 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. 883 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); 884 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); 885 setOperationAction(ISD::FRINT, MVT::v2f64, Expand); 886 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); 887 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); 888 setOperationAction(ISD::FMA, MVT::v2f64, Expand); 889 } 890 891 if (Subtarget->hasNEON()) { 892 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively 893 // supported for v4f32. 
894 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); 895 setOperationAction(ISD::FSIN, MVT::v4f32, Expand); 896 setOperationAction(ISD::FCOS, MVT::v4f32, Expand); 897 setOperationAction(ISD::FPOW, MVT::v4f32, Expand); 898 setOperationAction(ISD::FLOG, MVT::v4f32, Expand); 899 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); 900 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); 901 setOperationAction(ISD::FEXP, MVT::v4f32, Expand); 902 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); 903 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); 904 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); 905 setOperationAction(ISD::FRINT, MVT::v4f32, Expand); 906 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); 907 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); 908 909 // Mark v2f32 intrinsics. 910 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); 911 setOperationAction(ISD::FSIN, MVT::v2f32, Expand); 912 setOperationAction(ISD::FCOS, MVT::v2f32, Expand); 913 setOperationAction(ISD::FPOW, MVT::v2f32, Expand); 914 setOperationAction(ISD::FLOG, MVT::v2f32, Expand); 915 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); 916 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); 917 setOperationAction(ISD::FEXP, MVT::v2f32, Expand); 918 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); 919 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); 920 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); 921 setOperationAction(ISD::FRINT, MVT::v2f32, Expand); 922 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); 923 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); 924 925 // Neon does not support some operations on v1i64 and v2i64 types. 926 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 927 // Custom handling for some quad-vector types to detect VMULL. 928 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 929 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 930 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 931 // Custom handling for some vector types to avoid expensive expansions 932 setOperationAction(ISD::SDIV, MVT::v4i16, Custom); 933 setOperationAction(ISD::SDIV, MVT::v8i8, Custom); 934 setOperationAction(ISD::UDIV, MVT::v4i16, Custom); 935 setOperationAction(ISD::UDIV, MVT::v8i8, Custom); 936 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with 937 // a destination type that is wider than the source, and nor does 938 // it have a FP_TO_[SU]INT instruction with a narrower destination than 939 // source. 940 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); 941 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); 942 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 943 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 944 setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); 945 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); 946 setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); 947 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); 948 949 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 950 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); 951 952 // NEON does not have single instruction CTPOP for vectors with element 953 // types wider than 8-bits. However, custom lowering can leverage the 954 // v8i8/v16i8 vcnt instruction. 
955 setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); 956 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); 957 setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); 958 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); 959 setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); 960 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); 961 962 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 963 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 964 965 // NEON does not have single instruction CTTZ for vectors. 966 setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); 967 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); 968 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); 969 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 970 971 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); 972 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); 973 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); 974 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); 975 976 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); 977 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); 978 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); 979 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); 980 981 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); 982 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); 983 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); 984 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); 985 986 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 987 setOperationAction(ISD::MULHS, VT, Expand); 988 setOperationAction(ISD::MULHU, VT, Expand); 989 } 990 991 // NEON only has FMA instructions as of VFP4. 992 if (!Subtarget->hasVFP4Base()) { 993 setOperationAction(ISD::FMA, MVT::v2f32, Expand); 994 setOperationAction(ISD::FMA, MVT::v4f32, Expand); 995 } 996 997 setTargetDAGCombine({ISD::SHL, ISD::SRL, ISD::SRA, ISD::FP_TO_SINT, 998 ISD::FP_TO_UINT, ISD::FDIV, ISD::LOAD}); 999 1000 // It is legal to extload from v4i8 to v4i16 or v4i32. 1001 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, 1002 MVT::v2i32}) { 1003 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { 1004 setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); 1005 setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); 1006 setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); 1007 } 1008 } 1009 } 1010 1011 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { 1012 setTargetDAGCombine( 1013 {ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE, ISD::INSERT_SUBVECTOR, 1014 ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, 1015 ISD::SIGN_EXTEND_INREG, ISD::STORE, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, 1016 ISD::ANY_EXTEND, ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN, 1017 ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST}); 1018 } 1019 if (Subtarget->hasMVEIntegerOps()) { 1020 setTargetDAGCombine({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX, 1021 ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC, 1022 ISD::SETCC}); 1023 } 1024 if (Subtarget->hasMVEFloatOps()) { 1025 setTargetDAGCombine(ISD::FADD); 1026 } 1027 1028 if (!Subtarget->hasFP64()) { 1029 // When targeting a floating-point unit with only single-precision 1030 // operations, f64 is legal for the few double-precision instructions which 1031 // are present However, no double-precision operations other than moves, 1032 // loads and stores are provided by the hardware. 
1033 setOperationAction(ISD::FADD, MVT::f64, Expand); 1034 setOperationAction(ISD::FSUB, MVT::f64, Expand); 1035 setOperationAction(ISD::FMUL, MVT::f64, Expand); 1036 setOperationAction(ISD::FMA, MVT::f64, Expand); 1037 setOperationAction(ISD::FDIV, MVT::f64, Expand); 1038 setOperationAction(ISD::FREM, MVT::f64, Expand); 1039 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 1040 setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); 1041 setOperationAction(ISD::FNEG, MVT::f64, Expand); 1042 setOperationAction(ISD::FABS, MVT::f64, Expand); 1043 setOperationAction(ISD::FSQRT, MVT::f64, Expand); 1044 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1045 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1046 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1047 setOperationAction(ISD::FLOG, MVT::f64, Expand); 1048 setOperationAction(ISD::FLOG2, MVT::f64, Expand); 1049 setOperationAction(ISD::FLOG10, MVT::f64, Expand); 1050 setOperationAction(ISD::FEXP, MVT::f64, Expand); 1051 setOperationAction(ISD::FEXP2, MVT::f64, Expand); 1052 setOperationAction(ISD::FCEIL, MVT::f64, Expand); 1053 setOperationAction(ISD::FTRUNC, MVT::f64, Expand); 1054 setOperationAction(ISD::FRINT, MVT::f64, Expand); 1055 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); 1056 setOperationAction(ISD::FFLOOR, MVT::f64, Expand); 1057 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 1058 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 1059 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 1060 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 1061 setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); 1062 setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); 1063 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 1064 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); 1065 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); 1066 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom); 1067 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom); 1068 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); 1069 } 1070 1071 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { 1072 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); 1073 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); 1074 if (Subtarget->hasFullFP16()) { 1075 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); 1076 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); 1077 } 1078 } 1079 1080 if (!Subtarget->hasFP16()) { 1081 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); 1082 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); 1083 } 1084 1085 computeRegisterProperties(Subtarget->getRegisterInfo()); 1086 1087 // ARM does not have floating-point extending loads. 1088 for (MVT VT : MVT::fp_valuetypes()) { 1089 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 1090 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 1091 } 1092 1093 // ... or truncating stores 1094 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 1095 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 1096 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 1097 1098 // ARM does not have i1 sign extending load. 1099 for (MVT VT : MVT::integer_valuetypes()) 1100 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 1101 1102 // ARM supports all 4 flavors of integer indexed load / store. 
1103 if (!Subtarget->isThumb1Only()) { 1104 for (unsigned im = (unsigned)ISD::PRE_INC; 1105 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 1106 setIndexedLoadAction(im, MVT::i1, Legal); 1107 setIndexedLoadAction(im, MVT::i8, Legal); 1108 setIndexedLoadAction(im, MVT::i16, Legal); 1109 setIndexedLoadAction(im, MVT::i32, Legal); 1110 setIndexedStoreAction(im, MVT::i1, Legal); 1111 setIndexedStoreAction(im, MVT::i8, Legal); 1112 setIndexedStoreAction(im, MVT::i16, Legal); 1113 setIndexedStoreAction(im, MVT::i32, Legal); 1114 } 1115 } else { 1116 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. 1117 setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); 1118 setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); 1119 } 1120 1121 setOperationAction(ISD::SADDO, MVT::i32, Custom); 1122 setOperationAction(ISD::UADDO, MVT::i32, Custom); 1123 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 1124 setOperationAction(ISD::USUBO, MVT::i32, Custom); 1125 1126 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); 1127 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); 1128 if (Subtarget->hasDSP()) { 1129 setOperationAction(ISD::SADDSAT, MVT::i8, Custom); 1130 setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); 1131 setOperationAction(ISD::SADDSAT, MVT::i16, Custom); 1132 setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); 1133 setOperationAction(ISD::UADDSAT, MVT::i8, Custom); 1134 setOperationAction(ISD::USUBSAT, MVT::i8, Custom); 1135 setOperationAction(ISD::UADDSAT, MVT::i16, Custom); 1136 setOperationAction(ISD::USUBSAT, MVT::i16, Custom); 1137 } 1138 if (Subtarget->hasBaseDSP()) { 1139 setOperationAction(ISD::SADDSAT, MVT::i32, Legal); 1140 setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); 1141 } 1142 1143 // i64 operation support. 1144 setOperationAction(ISD::MUL, MVT::i64, Expand); 1145 setOperationAction(ISD::MULHU, MVT::i32, Expand); 1146 if (Subtarget->isThumb1Only()) { 1147 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 1148 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 1149 } 1150 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() 1151 || (Subtarget->isThumb2() && !Subtarget->hasDSP())) 1152 setOperationAction(ISD::MULHS, MVT::i32, Expand); 1153 1154 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 1155 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 1156 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 1157 setOperationAction(ISD::SRL, MVT::i64, Custom); 1158 setOperationAction(ISD::SRA, MVT::i64, Custom); 1159 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1160 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); 1161 setOperationAction(ISD::LOAD, MVT::i64, Custom); 1162 setOperationAction(ISD::STORE, MVT::i64, Custom); 1163 1164 // MVE lowers 64 bit shifts to lsll and lsrl 1165 // assuming that ISD::SRL and SRA of i64 are already marked custom 1166 if (Subtarget->hasMVEIntegerOps()) 1167 setOperationAction(ISD::SHL, MVT::i64, Custom); 1168 1169 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. 1170 if (Subtarget->isThumb1Only()) { 1171 setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); 1172 setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); 1173 setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); 1174 } 1175 1176 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) 1177 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 1178 1179 // ARM does not have ROTL. 
1180 setOperationAction(ISD::ROTL, MVT::i32, Expand); 1181 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 1182 setOperationAction(ISD::ROTL, VT, Expand); 1183 setOperationAction(ISD::ROTR, VT, Expand); 1184 } 1185 setOperationAction(ISD::CTTZ, MVT::i32, Custom); 1186 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 1187 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { 1188 setOperationAction(ISD::CTLZ, MVT::i32, Expand); 1189 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); 1190 } 1191 1192 // @llvm.readcyclecounter requires the Performance Monitors extension. 1193 // Default to the 0 expansion on unsupported platforms. 1194 // FIXME: Technically there are older ARM CPUs that have 1195 // implementation-specific ways of obtaining this information. 1196 if (Subtarget->hasPerfMon()) 1197 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); 1198 1199 // Only ARMv6 has BSWAP. 1200 if (!Subtarget->hasV6Ops()) 1201 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 1202 1203 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 1204 : Subtarget->hasDivideInARMMode(); 1205 if (!hasDivide) { 1206 // These are expanded into libcalls if the cpu doesn't have HW divider. 1207 setOperationAction(ISD::SDIV, MVT::i32, LibCall); 1208 setOperationAction(ISD::UDIV, MVT::i32, LibCall); 1209 } 1210 1211 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { 1212 setOperationAction(ISD::SDIV, MVT::i32, Custom); 1213 setOperationAction(ISD::UDIV, MVT::i32, Custom); 1214 1215 setOperationAction(ISD::SDIV, MVT::i64, Custom); 1216 setOperationAction(ISD::UDIV, MVT::i64, Custom); 1217 } 1218 1219 setOperationAction(ISD::SREM, MVT::i32, Expand); 1220 setOperationAction(ISD::UREM, MVT::i32, Expand); 1221 1222 // Register based DivRem for AEABI (RTABI 4.2) 1223 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 1224 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 1225 Subtarget->isTargetWindows()) { 1226 setOperationAction(ISD::SREM, MVT::i64, Custom); 1227 setOperationAction(ISD::UREM, MVT::i64, Custom); 1228 HasStandaloneRem = false; 1229 1230 if (Subtarget->isTargetWindows()) { 1231 const struct { 1232 const RTLIB::Libcall Op; 1233 const char * const Name; 1234 const CallingConv::ID CC; 1235 } LibraryCalls[] = { 1236 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1237 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1238 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1239 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, 1240 1241 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, 1242 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, 1243 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, 1244 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, 1245 }; 1246 1247 for (const auto &LC : LibraryCalls) { 1248 setLibcallName(LC.Op, LC.Name); 1249 setLibcallCallingConv(LC.Op, LC.CC); 1250 } 1251 } else { 1252 const struct { 1253 const RTLIB::Libcall Op; 1254 const char * const Name; 1255 const CallingConv::ID CC; 1256 } LibraryCalls[] = { 1257 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1258 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1259 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1260 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, 1261 1262 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1263 { RTLIB::UDIVREM_I16, 
"__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1264 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1265 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, 1266 }; 1267 1268 for (const auto &LC : LibraryCalls) { 1269 setLibcallName(LC.Op, LC.Name); 1270 setLibcallCallingConv(LC.Op, LC.CC); 1271 } 1272 } 1273 1274 setOperationAction(ISD::SDIVREM, MVT::i32, Custom); 1275 setOperationAction(ISD::UDIVREM, MVT::i32, Custom); 1276 setOperationAction(ISD::SDIVREM, MVT::i64, Custom); 1277 setOperationAction(ISD::UDIVREM, MVT::i64, Custom); 1278 } else { 1279 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 1280 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 1281 } 1282 1283 if (Subtarget->getTargetTriple().isOSMSVCRT()) { 1284 // MSVCRT doesn't have powi; fall back to pow 1285 setLibcallName(RTLIB::POWI_F32, nullptr); 1286 setLibcallName(RTLIB::POWI_F64, nullptr); 1287 } 1288 1289 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 1290 setOperationAction(ISD::ConstantPool, MVT::i32, Custom); 1291 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); 1292 setOperationAction(ISD::BlockAddress, MVT::i32, Custom); 1293 1294 setOperationAction(ISD::TRAP, MVT::Other, Legal); 1295 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 1296 1297 // Use the default implementation. 1298 setOperationAction(ISD::VASTART, MVT::Other, Custom); 1299 setOperationAction(ISD::VAARG, MVT::Other, Expand); 1300 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 1301 setOperationAction(ISD::VAEND, MVT::Other, Expand); 1302 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 1303 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 1304 1305 if (Subtarget->isTargetWindows()) 1306 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 1307 else 1308 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 1309 1310 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use 1311 // the default expansion. 1312 InsertFencesForAtomic = false; 1313 if (Subtarget->hasAnyDataBarrier() && 1314 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { 1315 // ATOMIC_FENCE needs custom lowering; the others should have been expanded 1316 // to ldrex/strex loops already. 1317 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); 1318 if (!Subtarget->isThumb() || !Subtarget->isMClass()) 1319 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 1320 1321 // On v8, we have particularly efficient implementations of atomic fences 1322 // if they can be combined with nearby atomic loads and stores. 1323 if (!Subtarget->hasAcquireRelease() || 1324 getTargetMachine().getOptLevel() == 0) { 1325 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. 1326 InsertFencesForAtomic = true; 1327 } 1328 } else { 1329 // If there's anything we can use as a barrier, go through custom lowering 1330 // for ATOMIC_FENCE. 1331 // If target has DMB in thumb, Fences can be inserted. 1332 if (Subtarget->hasDataBarrier()) 1333 InsertFencesForAtomic = true; 1334 1335 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, 1336 Subtarget->hasAnyDataBarrier() ? Custom : Expand); 1337 1338 // Set them all for expansion, which will force libcalls. 
1339 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); 1340 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); 1341 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); 1342 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 1343 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); 1344 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); 1345 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); 1346 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); 1347 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); 1348 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); 1349 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); 1350 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); 1351 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the 1352 // Unordered/Monotonic case. 1353 if (!InsertFencesForAtomic) { 1354 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); 1355 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); 1356 } 1357 } 1358 1359 // Compute supported atomic widths. 1360 if (Subtarget->isTargetLinux() || 1361 (!Subtarget->isMClass() && Subtarget->hasV6Ops())) { 1362 // For targets where __sync_* routines are reliably available, we use them 1363 // if necessary. 1364 // 1365 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic 1366 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt? 1367 // 1368 // ARMv6 targets have native instructions in ARM mode. For Thumb mode, 1369 // such targets should provide __sync_* routines, which use the ARM mode 1370 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent 1371 // encoding; see ARMISD::MEMBARRIER_MCR.) 1372 setMaxAtomicSizeInBitsSupported(64); 1373 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) || 1374 Subtarget->hasForced32BitAtomics()) { 1375 // Cortex-M (besides Cortex-M0) have 32-bit atomics. 1376 setMaxAtomicSizeInBitsSupported(32); 1377 } else { 1378 // We can't assume anything about other targets; just use libatomic 1379 // routines. 1380 setMaxAtomicSizeInBitsSupported(0); 1381 } 1382 1383 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 1384 1385 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. 1386 if (!Subtarget->hasV6Ops()) { 1387 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 1388 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 1389 } 1390 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 1391 1392 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && 1393 !Subtarget->isThumb1Only()) { 1394 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR 1395 // iff target supports vfp2. 1396 setOperationAction(ISD::BITCAST, MVT::i64, Custom); 1397 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 1398 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); 1399 } 1400 1401 // We want to custom lower some of our intrinsics. 
1402 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1403 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 1404 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 1405 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); 1406 if (Subtarget->useSjLjEH()) 1407 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); 1408 1409 setOperationAction(ISD::SETCC, MVT::i32, Expand); 1410 setOperationAction(ISD::SETCC, MVT::f32, Expand); 1411 setOperationAction(ISD::SETCC, MVT::f64, Expand); 1412 setOperationAction(ISD::SELECT, MVT::i32, Custom); 1413 setOperationAction(ISD::SELECT, MVT::f32, Custom); 1414 setOperationAction(ISD::SELECT, MVT::f64, Custom); 1415 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 1416 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 1417 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 1418 if (Subtarget->hasFullFP16()) { 1419 setOperationAction(ISD::SETCC, MVT::f16, Expand); 1420 setOperationAction(ISD::SELECT, MVT::f16, Custom); 1421 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 1422 } 1423 1424 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); 1425 1426 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 1427 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 1428 if (Subtarget->hasFullFP16()) 1429 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 1430 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 1431 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 1432 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 1433 1434 // We don't support sin/cos/fmod/copysign/pow 1435 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1436 setOperationAction(ISD::FSIN, MVT::f32, Expand); 1437 setOperationAction(ISD::FCOS, MVT::f32, Expand); 1438 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1439 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 1440 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 1441 setOperationAction(ISD::FREM, MVT::f64, Expand); 1442 setOperationAction(ISD::FREM, MVT::f32, Expand); 1443 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && 1444 !Subtarget->isThumb1Only()) { 1445 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 1446 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 1447 } 1448 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1449 setOperationAction(ISD::FPOW, MVT::f32, Expand); 1450 1451 if (!Subtarget->hasVFP4Base()) { 1452 setOperationAction(ISD::FMA, MVT::f64, Expand); 1453 setOperationAction(ISD::FMA, MVT::f32, Expand); 1454 } 1455 1456 // Various VFP goodness 1457 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { 1458 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 1459 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { 1460 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 1461 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 1462 } 1463 1464 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 1465 if (!Subtarget->hasFP16()) { 1466 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 1467 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 1468 } 1469 1470 // Strict floating-point comparisons need custom lowering. 
1471 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); 1472 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); 1473 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); 1474 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); 1475 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); 1476 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); 1477 } 1478 1479 // Use __sincos_stret if available. 1480 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 1481 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 1482 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1483 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1484 } 1485 1486 // FP-ARMv8 implements a lot of rounding-like FP operations. 1487 if (Subtarget->hasFPARMv8Base()) { 1488 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1489 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1490 setOperationAction(ISD::FROUND, MVT::f32, Legal); 1491 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1492 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1493 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1494 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 1495 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 1496 if (Subtarget->hasNEON()) { 1497 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 1498 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 1499 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 1500 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1501 } 1502 1503 if (Subtarget->hasFP64()) { 1504 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1505 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1506 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1507 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1508 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1509 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1510 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1511 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1512 } 1513 } 1514 1515 // FP16 often need to be promoted to call lib functions 1516 if (Subtarget->hasFullFP16()) { 1517 setOperationAction(ISD::FREM, MVT::f16, Promote); 1518 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 1519 setOperationAction(ISD::FSIN, MVT::f16, Promote); 1520 setOperationAction(ISD::FCOS, MVT::f16, Promote); 1521 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 1522 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 1523 setOperationAction(ISD::FPOW, MVT::f16, Promote); 1524 setOperationAction(ISD::FEXP, MVT::f16, Promote); 1525 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 1526 setOperationAction(ISD::FLOG, MVT::f16, Promote); 1527 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 1528 setOperationAction(ISD::FLOG2, MVT::f16, Promote); 1529 1530 setOperationAction(ISD::FROUND, MVT::f16, Legal); 1531 } 1532 1533 if (Subtarget->hasNEON()) { 1534 // vmin and vmax aren't available in a scalar form, so we can use 1535 // a NEON instruction with an undef lane instead. This has a performance 1536 // penalty on some cores, so we don't do this unless we have been 1537 // asked to by the core tuning model. 
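    // (i.e. a scalar fminimum/fmaximum is then performed with a NEON
    // vmin/vmax on a d-register, leaving the unused lanes undefined.)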
1538 if (Subtarget->useNEONForSinglePrecisionFP()) { 1539 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); 1540 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); 1541 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); 1542 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); 1543 } 1544 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); 1545 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); 1546 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); 1547 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal); 1548 1549 if (Subtarget->hasFullFP16()) { 1550 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); 1551 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); 1552 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); 1553 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); 1554 1555 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal); 1556 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal); 1557 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); 1558 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); 1559 } 1560 } 1561 1562 // We have target-specific dag combine patterns for the following nodes: 1563 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 1564 setTargetDAGCombine( 1565 {ISD::ADD, ISD::SUB, ISD::MUL, ISD::AND, ISD::OR, ISD::XOR}); 1566 1567 if (Subtarget->hasMVEIntegerOps()) 1568 setTargetDAGCombine(ISD::VSELECT); 1569 1570 if (Subtarget->hasV6Ops()) 1571 setTargetDAGCombine(ISD::SRL); 1572 if (Subtarget->isThumb1Only()) 1573 setTargetDAGCombine(ISD::SHL); 1574 // Attempt to lower smin/smax to ssat/usat 1575 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || 1576 Subtarget->isThumb2()) { 1577 setTargetDAGCombine({ISD::SMIN, ISD::SMAX}); 1578 } 1579 1580 setStackPointerRegisterToSaveRestore(ARM::SP); 1581 1582 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || 1583 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) 1584 setSchedulingPreference(Sched::RegPressure); 1585 else 1586 setSchedulingPreference(Sched::Hybrid); 1587 1588 //// temporary - rewrite interface to use type 1589 MaxStoresPerMemset = 8; 1590 MaxStoresPerMemsetOptSize = 4; 1591 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores 1592 MaxStoresPerMemcpyOptSize = 2; 1593 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores 1594 MaxStoresPerMemmoveOptSize = 2; 1595 1596 // On ARM arguments smaller than 4 bytes are extended, so all arguments 1597 // are at least 4 bytes aligned. 1598 setMinStackArgumentAlignment(Align(4)); 1599 1600 // Prefer likely predicted branches to selects on out-of-order cores. 1601 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); 1602 1603 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); 1604 1605 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4)); 1606 1607 if (Subtarget->isThumb() || Subtarget->isThumb2()) 1608 setTargetDAGCombine(ISD::ABS); 1609 } 1610 1611 bool ARMTargetLowering::useSoftFloat() const { 1612 return Subtarget->useSoftFloat(); 1613 } 1614 1615 // FIXME: It might make sense to define the representative register class as the 1616 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is 1617 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, 1618 // SPR's representative would be DPR_VFP2. 
This should work well if register 1619 // pressure tracking were modified such that a register use would increment the 1620 // pressure of the register class's representative and all of it's super 1621 // classes' representatives transitively. We have not implemented this because 1622 // of the difficulty prior to coalescing of modeling operand register classes 1623 // due to the common occurrence of cross class copies and subregister insertions 1624 // and extractions. 1625 std::pair<const TargetRegisterClass *, uint8_t> 1626 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 1627 MVT VT) const { 1628 const TargetRegisterClass *RRC = nullptr; 1629 uint8_t Cost = 1; 1630 switch (VT.SimpleTy) { 1631 default: 1632 return TargetLowering::findRepresentativeClass(TRI, VT); 1633 // Use DPR as representative register class for all floating point 1634 // and vector types. Since there are 32 SPR registers and 32 DPR registers so 1635 // the cost is 1 for both f32 and f64. 1636 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: 1637 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: 1638 RRC = &ARM::DPRRegClass; 1639 // When NEON is used for SP, only half of the register file is available 1640 // because operations that define both SP and DP results will be constrained 1641 // to the VFP2 class (D0-D15). We currently model this constraint prior to 1642 // coalescing by double-counting the SP regs. See the FIXME above. 1643 if (Subtarget->useNEONForSinglePrecisionFP()) 1644 Cost = 2; 1645 break; 1646 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1647 case MVT::v4f32: case MVT::v2f64: 1648 RRC = &ARM::DPRRegClass; 1649 Cost = 2; 1650 break; 1651 case MVT::v4i64: 1652 RRC = &ARM::DPRRegClass; 1653 Cost = 4; 1654 break; 1655 case MVT::v8i64: 1656 RRC = &ARM::DPRRegClass; 1657 Cost = 8; 1658 break; 1659 } 1660 return std::make_pair(RRC, Cost); 1661 } 1662 1663 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { 1664 #define MAKE_CASE(V) \ 1665 case V: \ 1666 return #V; 1667 switch ((ARMISD::NodeType)Opcode) { 1668 case ARMISD::FIRST_NUMBER: 1669 break; 1670 MAKE_CASE(ARMISD::Wrapper) 1671 MAKE_CASE(ARMISD::WrapperPIC) 1672 MAKE_CASE(ARMISD::WrapperJT) 1673 MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL) 1674 MAKE_CASE(ARMISD::CALL) 1675 MAKE_CASE(ARMISD::CALL_PRED) 1676 MAKE_CASE(ARMISD::CALL_NOLINK) 1677 MAKE_CASE(ARMISD::tSECALL) 1678 MAKE_CASE(ARMISD::t2CALL_BTI) 1679 MAKE_CASE(ARMISD::BRCOND) 1680 MAKE_CASE(ARMISD::BR_JT) 1681 MAKE_CASE(ARMISD::BR2_JT) 1682 MAKE_CASE(ARMISD::RET_FLAG) 1683 MAKE_CASE(ARMISD::SERET_FLAG) 1684 MAKE_CASE(ARMISD::INTRET_FLAG) 1685 MAKE_CASE(ARMISD::PIC_ADD) 1686 MAKE_CASE(ARMISD::CMP) 1687 MAKE_CASE(ARMISD::CMN) 1688 MAKE_CASE(ARMISD::CMPZ) 1689 MAKE_CASE(ARMISD::CMPFP) 1690 MAKE_CASE(ARMISD::CMPFPE) 1691 MAKE_CASE(ARMISD::CMPFPw0) 1692 MAKE_CASE(ARMISD::CMPFPEw0) 1693 MAKE_CASE(ARMISD::BCC_i64) 1694 MAKE_CASE(ARMISD::FMSTAT) 1695 MAKE_CASE(ARMISD::CMOV) 1696 MAKE_CASE(ARMISD::SUBS) 1697 MAKE_CASE(ARMISD::SSAT) 1698 MAKE_CASE(ARMISD::USAT) 1699 MAKE_CASE(ARMISD::ASRL) 1700 MAKE_CASE(ARMISD::LSRL) 1701 MAKE_CASE(ARMISD::LSLL) 1702 MAKE_CASE(ARMISD::SRL_FLAG) 1703 MAKE_CASE(ARMISD::SRA_FLAG) 1704 MAKE_CASE(ARMISD::RRX) 1705 MAKE_CASE(ARMISD::ADDC) 1706 MAKE_CASE(ARMISD::ADDE) 1707 MAKE_CASE(ARMISD::SUBC) 1708 MAKE_CASE(ARMISD::SUBE) 1709 MAKE_CASE(ARMISD::LSLS) 1710 MAKE_CASE(ARMISD::VMOVRRD) 1711 MAKE_CASE(ARMISD::VMOVDRR) 1712 MAKE_CASE(ARMISD::VMOVhr) 1713 MAKE_CASE(ARMISD::VMOVrh) 1714 
MAKE_CASE(ARMISD::VMOVSR) 1715 MAKE_CASE(ARMISD::EH_SJLJ_SETJMP) 1716 MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP) 1717 MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH) 1718 MAKE_CASE(ARMISD::TC_RETURN) 1719 MAKE_CASE(ARMISD::THREAD_POINTER) 1720 MAKE_CASE(ARMISD::DYN_ALLOC) 1721 MAKE_CASE(ARMISD::MEMBARRIER_MCR) 1722 MAKE_CASE(ARMISD::PRELOAD) 1723 MAKE_CASE(ARMISD::LDRD) 1724 MAKE_CASE(ARMISD::STRD) 1725 MAKE_CASE(ARMISD::WIN__CHKSTK) 1726 MAKE_CASE(ARMISD::WIN__DBZCHK) 1727 MAKE_CASE(ARMISD::PREDICATE_CAST) 1728 MAKE_CASE(ARMISD::VECTOR_REG_CAST) 1729 MAKE_CASE(ARMISD::MVESEXT) 1730 MAKE_CASE(ARMISD::MVEZEXT) 1731 MAKE_CASE(ARMISD::MVETRUNC) 1732 MAKE_CASE(ARMISD::VCMP) 1733 MAKE_CASE(ARMISD::VCMPZ) 1734 MAKE_CASE(ARMISD::VTST) 1735 MAKE_CASE(ARMISD::VSHLs) 1736 MAKE_CASE(ARMISD::VSHLu) 1737 MAKE_CASE(ARMISD::VSHLIMM) 1738 MAKE_CASE(ARMISD::VSHRsIMM) 1739 MAKE_CASE(ARMISD::VSHRuIMM) 1740 MAKE_CASE(ARMISD::VRSHRsIMM) 1741 MAKE_CASE(ARMISD::VRSHRuIMM) 1742 MAKE_CASE(ARMISD::VRSHRNIMM) 1743 MAKE_CASE(ARMISD::VQSHLsIMM) 1744 MAKE_CASE(ARMISD::VQSHLuIMM) 1745 MAKE_CASE(ARMISD::VQSHLsuIMM) 1746 MAKE_CASE(ARMISD::VQSHRNsIMM) 1747 MAKE_CASE(ARMISD::VQSHRNuIMM) 1748 MAKE_CASE(ARMISD::VQSHRNsuIMM) 1749 MAKE_CASE(ARMISD::VQRSHRNsIMM) 1750 MAKE_CASE(ARMISD::VQRSHRNuIMM) 1751 MAKE_CASE(ARMISD::VQRSHRNsuIMM) 1752 MAKE_CASE(ARMISD::VSLIIMM) 1753 MAKE_CASE(ARMISD::VSRIIMM) 1754 MAKE_CASE(ARMISD::VGETLANEu) 1755 MAKE_CASE(ARMISD::VGETLANEs) 1756 MAKE_CASE(ARMISD::VMOVIMM) 1757 MAKE_CASE(ARMISD::VMVNIMM) 1758 MAKE_CASE(ARMISD::VMOVFPIMM) 1759 MAKE_CASE(ARMISD::VDUP) 1760 MAKE_CASE(ARMISD::VDUPLANE) 1761 MAKE_CASE(ARMISD::VEXT) 1762 MAKE_CASE(ARMISD::VREV64) 1763 MAKE_CASE(ARMISD::VREV32) 1764 MAKE_CASE(ARMISD::VREV16) 1765 MAKE_CASE(ARMISD::VZIP) 1766 MAKE_CASE(ARMISD::VUZP) 1767 MAKE_CASE(ARMISD::VTRN) 1768 MAKE_CASE(ARMISD::VTBL1) 1769 MAKE_CASE(ARMISD::VTBL2) 1770 MAKE_CASE(ARMISD::VMOVN) 1771 MAKE_CASE(ARMISD::VQMOVNs) 1772 MAKE_CASE(ARMISD::VQMOVNu) 1773 MAKE_CASE(ARMISD::VCVTN) 1774 MAKE_CASE(ARMISD::VCVTL) 1775 MAKE_CASE(ARMISD::VIDUP) 1776 MAKE_CASE(ARMISD::VMULLs) 1777 MAKE_CASE(ARMISD::VMULLu) 1778 MAKE_CASE(ARMISD::VQDMULH) 1779 MAKE_CASE(ARMISD::VADDVs) 1780 MAKE_CASE(ARMISD::VADDVu) 1781 MAKE_CASE(ARMISD::VADDVps) 1782 MAKE_CASE(ARMISD::VADDVpu) 1783 MAKE_CASE(ARMISD::VADDLVs) 1784 MAKE_CASE(ARMISD::VADDLVu) 1785 MAKE_CASE(ARMISD::VADDLVAs) 1786 MAKE_CASE(ARMISD::VADDLVAu) 1787 MAKE_CASE(ARMISD::VADDLVps) 1788 MAKE_CASE(ARMISD::VADDLVpu) 1789 MAKE_CASE(ARMISD::VADDLVAps) 1790 MAKE_CASE(ARMISD::VADDLVApu) 1791 MAKE_CASE(ARMISD::VMLAVs) 1792 MAKE_CASE(ARMISD::VMLAVu) 1793 MAKE_CASE(ARMISD::VMLAVps) 1794 MAKE_CASE(ARMISD::VMLAVpu) 1795 MAKE_CASE(ARMISD::VMLALVs) 1796 MAKE_CASE(ARMISD::VMLALVu) 1797 MAKE_CASE(ARMISD::VMLALVps) 1798 MAKE_CASE(ARMISD::VMLALVpu) 1799 MAKE_CASE(ARMISD::VMLALVAs) 1800 MAKE_CASE(ARMISD::VMLALVAu) 1801 MAKE_CASE(ARMISD::VMLALVAps) 1802 MAKE_CASE(ARMISD::VMLALVApu) 1803 MAKE_CASE(ARMISD::VMINVu) 1804 MAKE_CASE(ARMISD::VMINVs) 1805 MAKE_CASE(ARMISD::VMAXVu) 1806 MAKE_CASE(ARMISD::VMAXVs) 1807 MAKE_CASE(ARMISD::UMAAL) 1808 MAKE_CASE(ARMISD::UMLAL) 1809 MAKE_CASE(ARMISD::SMLAL) 1810 MAKE_CASE(ARMISD::SMLALBB) 1811 MAKE_CASE(ARMISD::SMLALBT) 1812 MAKE_CASE(ARMISD::SMLALTB) 1813 MAKE_CASE(ARMISD::SMLALTT) 1814 MAKE_CASE(ARMISD::SMULWB) 1815 MAKE_CASE(ARMISD::SMULWT) 1816 MAKE_CASE(ARMISD::SMLALD) 1817 MAKE_CASE(ARMISD::SMLALDX) 1818 MAKE_CASE(ARMISD::SMLSLD) 1819 MAKE_CASE(ARMISD::SMLSLDX) 1820 MAKE_CASE(ARMISD::SMMLAR) 1821 MAKE_CASE(ARMISD::SMMLSR) 1822 MAKE_CASE(ARMISD::QADD16b) 1823 
MAKE_CASE(ARMISD::QSUB16b) 1824 MAKE_CASE(ARMISD::QADD8b) 1825 MAKE_CASE(ARMISD::QSUB8b) 1826 MAKE_CASE(ARMISD::UQADD16b) 1827 MAKE_CASE(ARMISD::UQSUB16b) 1828 MAKE_CASE(ARMISD::UQADD8b) 1829 MAKE_CASE(ARMISD::UQSUB8b) 1830 MAKE_CASE(ARMISD::BUILD_VECTOR) 1831 MAKE_CASE(ARMISD::BFI) 1832 MAKE_CASE(ARMISD::VORRIMM) 1833 MAKE_CASE(ARMISD::VBICIMM) 1834 MAKE_CASE(ARMISD::VBSP) 1835 MAKE_CASE(ARMISD::MEMCPY) 1836 MAKE_CASE(ARMISD::VLD1DUP) 1837 MAKE_CASE(ARMISD::VLD2DUP) 1838 MAKE_CASE(ARMISD::VLD3DUP) 1839 MAKE_CASE(ARMISD::VLD4DUP) 1840 MAKE_CASE(ARMISD::VLD1_UPD) 1841 MAKE_CASE(ARMISD::VLD2_UPD) 1842 MAKE_CASE(ARMISD::VLD3_UPD) 1843 MAKE_CASE(ARMISD::VLD4_UPD) 1844 MAKE_CASE(ARMISD::VLD1x2_UPD) 1845 MAKE_CASE(ARMISD::VLD1x3_UPD) 1846 MAKE_CASE(ARMISD::VLD1x4_UPD) 1847 MAKE_CASE(ARMISD::VLD2LN_UPD) 1848 MAKE_CASE(ARMISD::VLD3LN_UPD) 1849 MAKE_CASE(ARMISD::VLD4LN_UPD) 1850 MAKE_CASE(ARMISD::VLD1DUP_UPD) 1851 MAKE_CASE(ARMISD::VLD2DUP_UPD) 1852 MAKE_CASE(ARMISD::VLD3DUP_UPD) 1853 MAKE_CASE(ARMISD::VLD4DUP_UPD) 1854 MAKE_CASE(ARMISD::VST1_UPD) 1855 MAKE_CASE(ARMISD::VST2_UPD) 1856 MAKE_CASE(ARMISD::VST3_UPD) 1857 MAKE_CASE(ARMISD::VST4_UPD) 1858 MAKE_CASE(ARMISD::VST1x2_UPD) 1859 MAKE_CASE(ARMISD::VST1x3_UPD) 1860 MAKE_CASE(ARMISD::VST1x4_UPD) 1861 MAKE_CASE(ARMISD::VST2LN_UPD) 1862 MAKE_CASE(ARMISD::VST3LN_UPD) 1863 MAKE_CASE(ARMISD::VST4LN_UPD) 1864 MAKE_CASE(ARMISD::WLS) 1865 MAKE_CASE(ARMISD::WLSSETUP) 1866 MAKE_CASE(ARMISD::LE) 1867 MAKE_CASE(ARMISD::LOOP_DEC) 1868 MAKE_CASE(ARMISD::CSINV) 1869 MAKE_CASE(ARMISD::CSNEG) 1870 MAKE_CASE(ARMISD::CSINC) 1871 MAKE_CASE(ARMISD::MEMCPYLOOP) 1872 MAKE_CASE(ARMISD::MEMSETLOOP) 1873 #undef MAKE_CASE 1874 } 1875 return nullptr; 1876 } 1877 1878 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1879 EVT VT) const { 1880 if (!VT.isVector()) 1881 return getPointerTy(DL); 1882 1883 // MVE has a predicate register. 1884 if ((Subtarget->hasMVEIntegerOps() && 1885 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 1886 VT == MVT::v16i8)) || 1887 (Subtarget->hasMVEFloatOps() && 1888 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16))) 1889 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); 1890 return VT.changeVectorElementTypeToInteger(); 1891 } 1892 1893 /// getRegClassFor - Return the register class that should be used for the 1894 /// specified value type. 1895 const TargetRegisterClass * 1896 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { 1897 (void)isDivergent; 1898 // Map v4i64 to QQ registers but do not make the type legal. Similarly map 1899 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to 1900 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive 1901 // MVE Q registers. 1902 if (Subtarget->hasNEON()) { 1903 if (VT == MVT::v4i64) 1904 return &ARM::QQPRRegClass; 1905 if (VT == MVT::v8i64) 1906 return &ARM::QQQQPRRegClass; 1907 } 1908 if (Subtarget->hasMVEIntegerOps()) { 1909 if (VT == MVT::v4i64) 1910 return &ARM::MQQPRRegClass; 1911 if (VT == MVT::v8i64) 1912 return &ARM::MQQQQPRRegClass; 1913 } 1914 return TargetLowering::getRegClassFor(VT); 1915 } 1916 1917 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the 1918 // source/dest is aligned and the copy size is large enough. We therefore want 1919 // to align such objects passed to memory intrinsics. 
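// (The hook below requests that pointer arguments to memory intrinsics of at
// least MinSize = 8 bytes be given a preferred alignment of 8 on v6+
// A/R-class cores and 4 elsewhere.)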
1920 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1921                                                 Align &PrefAlign) const {
1922   if (!isa<MemIntrinsic>(CI))
1923     return false;
1924   MinSize = 8;
1925   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1926   // cycle faster than 4-byte aligned LDM.
1927   PrefAlign =
1928       (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1929   return true;
1930 }
1931
1932 // Create a fast isel object.
1933 FastISel *
1934 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1935                                   const TargetLibraryInfo *libInfo) const {
1936   return ARM::createFastISel(funcInfo, libInfo);
1937 }
1938
1939 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1940   unsigned NumVals = N->getNumValues();
1941   if (!NumVals)
1942     return Sched::RegPressure;
1943
1944   for (unsigned i = 0; i != NumVals; ++i) {
1945     EVT VT = N->getValueType(i);
1946     if (VT == MVT::Glue || VT == MVT::Other)
1947       continue;
1948     if (VT.isFloatingPoint() || VT.isVector())
1949       return Sched::ILP;
1950   }
1951
1952   if (!N->isMachineOpcode())
1953     return Sched::RegPressure;
1954
1955   // Loads are scheduled for latency even if the instruction itinerary
1956   // is not available.
1957   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1958   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1959
1960   if (MCID.getNumDefs() == 0)
1961     return Sched::RegPressure;
1962   if (!Itins->isEmpty() &&
1963       Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1964     return Sched::ILP;
1965
1966   return Sched::RegPressure;
1967 }
1968
1969 //===----------------------------------------------------------------------===//
1970 // Lowering Code
1971 //===----------------------------------------------------------------------===//
1972
1973 static bool isSRL16(const SDValue &Op) {
1974   if (Op.getOpcode() != ISD::SRL)
1975     return false;
1976   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1977     return Const->getZExtValue() == 16;
1978   return false;
1979 }
1980
1981 static bool isSRA16(const SDValue &Op) {
1982   if (Op.getOpcode() != ISD::SRA)
1983     return false;
1984   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1985     return Const->getZExtValue() == 16;
1986   return false;
1987 }
1988
1989 static bool isSHL16(const SDValue &Op) {
1990   if (Op.getOpcode() != ISD::SHL)
1991     return false;
1992   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1993     return Const->getZExtValue() == 16;
1994   return false;
1995 }
1996
1997 // Check for a signed 16-bit value. We special case SRA because it makes it
1998 // simpler when also looking for SRAs that aren't sign extending a
1999 // smaller value. Without the check, we'd need to take extra care with
2000 // checking order for some operations.
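// For example, (sra (shl X, 16), 16) is accepted directly; otherwise a value
// counts as signed 16-bit when ComputeNumSignBits reports 17 sign bits (a
// sign-extended 16-bit quantity held in an i32).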
2001 static bool isS16(const SDValue &Op, SelectionDAG &DAG) { 2002 if (isSRA16(Op)) 2003 return isSHL16(Op.getOperand(0)); 2004 return DAG.ComputeNumSignBits(Op) == 17; 2005 } 2006 2007 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 2008 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 2009 switch (CC) { 2010 default: llvm_unreachable("Unknown condition code!"); 2011 case ISD::SETNE: return ARMCC::NE; 2012 case ISD::SETEQ: return ARMCC::EQ; 2013 case ISD::SETGT: return ARMCC::GT; 2014 case ISD::SETGE: return ARMCC::GE; 2015 case ISD::SETLT: return ARMCC::LT; 2016 case ISD::SETLE: return ARMCC::LE; 2017 case ISD::SETUGT: return ARMCC::HI; 2018 case ISD::SETUGE: return ARMCC::HS; 2019 case ISD::SETULT: return ARMCC::LO; 2020 case ISD::SETULE: return ARMCC::LS; 2021 } 2022 } 2023 2024 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 2025 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 2026 ARMCC::CondCodes &CondCode2) { 2027 CondCode2 = ARMCC::AL; 2028 switch (CC) { 2029 default: llvm_unreachable("Unknown FP condition!"); 2030 case ISD::SETEQ: 2031 case ISD::SETOEQ: CondCode = ARMCC::EQ; break; 2032 case ISD::SETGT: 2033 case ISD::SETOGT: CondCode = ARMCC::GT; break; 2034 case ISD::SETGE: 2035 case ISD::SETOGE: CondCode = ARMCC::GE; break; 2036 case ISD::SETOLT: CondCode = ARMCC::MI; break; 2037 case ISD::SETOLE: CondCode = ARMCC::LS; break; 2038 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; 2039 case ISD::SETO: CondCode = ARMCC::VC; break; 2040 case ISD::SETUO: CondCode = ARMCC::VS; break; 2041 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; 2042 case ISD::SETUGT: CondCode = ARMCC::HI; break; 2043 case ISD::SETUGE: CondCode = ARMCC::PL; break; 2044 case ISD::SETLT: 2045 case ISD::SETULT: CondCode = ARMCC::LT; break; 2046 case ISD::SETLE: 2047 case ISD::SETULE: CondCode = ARMCC::LE; break; 2048 case ISD::SETNE: 2049 case ISD::SETUNE: CondCode = ARMCC::NE; break; 2050 } 2051 } 2052 2053 //===----------------------------------------------------------------------===// 2054 // Calling Convention Implementation 2055 //===----------------------------------------------------------------------===// 2056 2057 /// getEffectiveCallingConv - Get the effective calling convention, taking into 2058 /// account presence of floating point hardware and calling convention 2059 /// limitations, such as support for variadic functions. 2060 CallingConv::ID 2061 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 2062 bool isVarArg) const { 2063 switch (CC) { 2064 default: 2065 report_fatal_error("Unsupported calling convention"); 2066 case CallingConv::ARM_AAPCS: 2067 case CallingConv::ARM_APCS: 2068 case CallingConv::GHC: 2069 case CallingConv::CFGuard_Check: 2070 return CC; 2071 case CallingConv::PreserveMost: 2072 return CallingConv::PreserveMost; 2073 case CallingConv::ARM_AAPCS_VFP: 2074 case CallingConv::Swift: 2075 case CallingConv::SwiftTail: 2076 return isVarArg ? 
CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 2077 case CallingConv::C: 2078 case CallingConv::Tail: 2079 if (!Subtarget->isAAPCS_ABI()) 2080 return CallingConv::ARM_APCS; 2081 else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && 2082 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 2083 !isVarArg) 2084 return CallingConv::ARM_AAPCS_VFP; 2085 else 2086 return CallingConv::ARM_AAPCS; 2087 case CallingConv::Fast: 2088 case CallingConv::CXX_FAST_TLS: 2089 if (!Subtarget->isAAPCS_ABI()) { 2090 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) 2091 return CallingConv::Fast; 2092 return CallingConv::ARM_APCS; 2093 } else if (Subtarget->hasVFP2Base() && 2094 !Subtarget->isThumb1Only() && !isVarArg) 2095 return CallingConv::ARM_AAPCS_VFP; 2096 else 2097 return CallingConv::ARM_AAPCS; 2098 } 2099 } 2100 2101 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 2102 bool isVarArg) const { 2103 return CCAssignFnForNode(CC, false, isVarArg); 2104 } 2105 2106 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 2107 bool isVarArg) const { 2108 return CCAssignFnForNode(CC, true, isVarArg); 2109 } 2110 2111 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 2112 /// CallingConvention. 2113 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 2114 bool Return, 2115 bool isVarArg) const { 2116 switch (getEffectiveCallingConv(CC, isVarArg)) { 2117 default: 2118 report_fatal_error("Unsupported calling convention"); 2119 case CallingConv::ARM_APCS: 2120 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 2121 case CallingConv::ARM_AAPCS: 2122 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 2123 case CallingConv::ARM_AAPCS_VFP: 2124 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 2125 case CallingConv::Fast: 2126 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 2127 case CallingConv::GHC: 2128 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); 2129 case CallingConv::PreserveMost: 2130 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 2131 case CallingConv::CFGuard_Check: 2132 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check); 2133 } 2134 } 2135 2136 SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, 2137 MVT LocVT, MVT ValVT, SDValue Val) const { 2138 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()), 2139 Val); 2140 if (Subtarget->hasFullFP16()) { 2141 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val); 2142 } else { 2143 Val = DAG.getNode(ISD::TRUNCATE, dl, 2144 MVT::getIntegerVT(ValVT.getSizeInBits()), Val); 2145 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val); 2146 } 2147 return Val; 2148 } 2149 2150 SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, 2151 MVT LocVT, MVT ValVT, 2152 SDValue Val) const { 2153 if (Subtarget->hasFullFP16()) { 2154 Val = DAG.getNode(ARMISD::VMOVrh, dl, 2155 MVT::getIntegerVT(LocVT.getSizeInBits()), Val); 2156 } else { 2157 Val = DAG.getNode(ISD::BITCAST, dl, 2158 MVT::getIntegerVT(ValVT.getSizeInBits()), Val); 2159 Val = DAG.getNode(ISD::ZERO_EXTEND, dl, 2160 MVT::getIntegerVT(LocVT.getSizeInBits()), Val); 2161 } 2162 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val); 2163 } 2164 2165 /// LowerCallResult - Lower the result values of a call into the 2166 /// appropriate copies out of appropriate physical registers. 
2167 SDValue ARMTargetLowering::LowerCallResult( 2168 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 2169 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2170 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 2171 SDValue ThisVal) const { 2172 // Assign locations to each value returned by this call. 2173 SmallVector<CCValAssign, 16> RVLocs; 2174 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2175 *DAG.getContext()); 2176 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); 2177 2178 // Copy all of the result registers out of their specified physreg. 2179 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2180 CCValAssign VA = RVLocs[i]; 2181 2182 // Pass 'this' value directly from the argument to return value, to avoid 2183 // reg unit interference 2184 if (i == 0 && isThisReturn) { 2185 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 2186 "unexpected return calling convention register assignment"); 2187 InVals.push_back(ThisVal); 2188 continue; 2189 } 2190 2191 SDValue Val; 2192 if (VA.needsCustom() && 2193 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) { 2194 // Handle f64 or half of a v2f64. 2195 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 2196 InFlag); 2197 Chain = Lo.getValue(1); 2198 InFlag = Lo.getValue(2); 2199 VA = RVLocs[++i]; // skip ahead to next loc 2200 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 2201 InFlag); 2202 Chain = Hi.getValue(1); 2203 InFlag = Hi.getValue(2); 2204 if (!Subtarget->isLittle()) 2205 std::swap (Lo, Hi); 2206 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 2207 2208 if (VA.getLocVT() == MVT::v2f64) { 2209 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 2210 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2211 DAG.getConstant(0, dl, MVT::i32)); 2212 2213 VA = RVLocs[++i]; // skip ahead to next loc 2214 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2215 Chain = Lo.getValue(1); 2216 InFlag = Lo.getValue(2); 2217 VA = RVLocs[++i]; // skip ahead to next loc 2218 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2219 Chain = Hi.getValue(1); 2220 InFlag = Hi.getValue(2); 2221 if (!Subtarget->isLittle()) 2222 std::swap (Lo, Hi); 2223 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 2224 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2225 DAG.getConstant(1, dl, MVT::i32)); 2226 } 2227 } else { 2228 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 2229 InFlag); 2230 Chain = Val.getValue(1); 2231 InFlag = Val.getValue(2); 2232 } 2233 2234 switch (VA.getLocInfo()) { 2235 default: llvm_unreachable("Unknown loc info!"); 2236 case CCValAssign::Full: break; 2237 case CCValAssign::BCvt: 2238 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 2239 break; 2240 } 2241 2242 // f16 arguments have their size extended to 4 bytes and passed as if they 2243 // had been copied to the LSBs of a 32-bit register. 
2244 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 2245 if (VA.needsCustom() && 2246 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) 2247 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val); 2248 2249 InVals.push_back(Val); 2250 } 2251 2252 return Chain; 2253 } 2254 2255 std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg( 2256 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr, 2257 bool IsTailCall, int SPDiff) const { 2258 SDValue DstAddr; 2259 MachinePointerInfo DstInfo; 2260 int32_t Offset = VA.getLocMemOffset(); 2261 MachineFunction &MF = DAG.getMachineFunction(); 2262 2263 if (IsTailCall) { 2264 Offset += SPDiff; 2265 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2266 int Size = VA.getLocVT().getFixedSizeInBits() / 8; 2267 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); 2268 DstAddr = DAG.getFrameIndex(FI, PtrVT); 2269 DstInfo = 2270 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 2271 } else { 2272 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); 2273 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2274 StackPtr, PtrOff); 2275 DstInfo = 2276 MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset); 2277 } 2278 2279 return std::make_pair(DstAddr, DstInfo); 2280 } 2281 2282 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 2283 SDValue Chain, SDValue &Arg, 2284 RegsToPassVector &RegsToPass, 2285 CCValAssign &VA, CCValAssign &NextVA, 2286 SDValue &StackPtr, 2287 SmallVectorImpl<SDValue> &MemOpChains, 2288 bool IsTailCall, 2289 int SPDiff) const { 2290 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2291 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2292 unsigned id = Subtarget->isLittle() ? 0 : 1; 2293 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 2294 2295 if (NextVA.isRegLoc()) 2296 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 2297 else { 2298 assert(NextVA.isMemLoc()); 2299 if (!StackPtr.getNode()) 2300 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 2301 getPointerTy(DAG.getDataLayout())); 2302 2303 SDValue DstAddr; 2304 MachinePointerInfo DstInfo; 2305 std::tie(DstAddr, DstInfo) = 2306 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff); 2307 MemOpChains.push_back( 2308 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo)); 2309 } 2310 } 2311 2312 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { 2313 return (CC == CallingConv::Fast && GuaranteeTailCalls) || 2314 CC == CallingConv::Tail || CC == CallingConv::SwiftTail; 2315 } 2316 2317 /// LowerCall - Lowering a call into a callseq_start <- 2318 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 2319 /// nodes. 
2320 SDValue
2321 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2322                              SmallVectorImpl<SDValue> &InVals) const {
2323   SelectionDAG &DAG = CLI.DAG;
2324   SDLoc &dl = CLI.DL;
2325   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2326   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2327   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2328   SDValue Chain = CLI.Chain;
2329   SDValue Callee = CLI.Callee;
2330   bool &isTailCall = CLI.IsTailCall;
2331   CallingConv::ID CallConv = CLI.CallConv;
2332   bool doesNotRet = CLI.DoesNotReturn;
2333   bool isVarArg = CLI.IsVarArg;
2334
2335   MachineFunction &MF = DAG.getMachineFunction();
2336   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2337   MachineFunction::CallSiteInfo CSInfo;
2338   bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2339   bool isThisReturn = false;
2340   bool isCmseNSCall = false;
2341   bool isSibCall = false;
2342   bool PreferIndirect = false;
2343   bool GuardWithBTI = false;
2344
2345   // Lower 'returns_twice' calls to a pseudo-instruction.
2346   if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2347       !Subtarget->noBTIAtReturnTwice())
2348     GuardWithBTI = AFI->branchTargetEnforcement();
2349
2350   // Determine whether this is a non-secure function call.
2351   if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2352     isCmseNSCall = true;
2353
2354   // Disable tail calls if they're not supported.
2355   if (!Subtarget->supportsTailCall())
2356     isTailCall = false;
2357
2358   // For both non-secure calls and returns from a CMSE entry function, the
2359   // function needs to do some extra work after the call, or before the
2360   // return, respectively; thus it cannot end with a tail call.
2361   if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2362     isTailCall = false;
2363
2364   if (isa<GlobalAddressSDNode>(Callee)) {
2365     // If we're optimizing for minimum size and the function is called three or
2366     // more times in this block, we can improve codesize by calling indirectly
2367     // as BLXr has a 16-bit encoding.
2368     auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2369     if (CLI.CB) {
2370       auto *BB = CLI.CB->getParent();
2371       PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2372                        count_if(GV->users(), [&BB](const User *U) {
2373                          return isa<Instruction>(U) &&
2374                                 cast<Instruction>(U)->getParent() == BB;
2375                        }) > 2;
2376     }
2377   }
2378   if (isTailCall) {
2379     // Check if it's really possible to do a tail call.
2380     isTailCall = IsEligibleForTailCallOptimization(
2381         Callee, CallConv, isVarArg, isStructRet,
2382         MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2383         PreferIndirect);
2384
2385     if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2386         CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2387       isSibCall = true;
2388
2389     // We don't support GuaranteedTailCallOpt for ARM, only automatically
2390     // detected sibcalls.
2391     if (isTailCall)
2392       ++NumTailCalls;
2393   }
2394
2395   if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2396     report_fatal_error("failed to perform tail call elimination on a call "
2397                        "site marked musttail");
2398   // Analyze operands of the call, assigning locations to each operand.
2399 SmallVector<CCValAssign, 16> ArgLocs; 2400 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2401 *DAG.getContext()); 2402 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 2403 2404 // Get a count of how many bytes are to be pushed on the stack. 2405 unsigned NumBytes = CCInfo.getNextStackOffset(); 2406 2407 // SPDiff is the byte offset of the call's argument area from the callee's. 2408 // Stores to callee stack arguments will be placed in FixedStackSlots offset 2409 // by this amount for a tail call. In a sibling call it must be 0 because the 2410 // caller will deallocate the entire stack and the callee still expects its 2411 // arguments to begin at SP+0. Completely unused for non-tail calls. 2412 int SPDiff = 0; 2413 2414 if (isTailCall && !isSibCall) { 2415 auto FuncInfo = MF.getInfo<ARMFunctionInfo>(); 2416 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize(); 2417 2418 // Since callee will pop argument stack as a tail call, we must keep the 2419 // popped size 16-byte aligned. 2420 Align StackAlign = DAG.getDataLayout().getStackAlignment(); 2421 NumBytes = alignTo(NumBytes, StackAlign); 2422 2423 // SPDiff will be negative if this tail call requires more space than we 2424 // would automatically have in our incoming argument space. Positive if we 2425 // can actually shrink the stack. 2426 SPDiff = NumReusableBytes - NumBytes; 2427 2428 // If this call requires more stack than we have available from 2429 // LowerFormalArguments, tell FrameLowering to reserve space for it. 2430 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff) 2431 AFI->setArgRegsSaveSize(-SPDiff); 2432 } 2433 2434 if (isSibCall) { 2435 // For sibling tail calls, memory operands are available in our caller's stack. 2436 NumBytes = 0; 2437 } else { 2438 // Adjust the stack pointer for the new arguments... 2439 // These operations are automatically eliminated by the prolog/epilog pass 2440 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl); 2441 } 2442 2443 SDValue StackPtr = 2444 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 2445 2446 RegsToPassVector RegsToPass; 2447 SmallVector<SDValue, 8> MemOpChains; 2448 2449 // During a tail call, stores to the argument area must happen after all of 2450 // the function's incoming arguments have been loaded because they may alias. 2451 // This is done by folding in a TokenFactor from LowerFormalArguments, but 2452 // there's no point in doing so repeatedly so this tracks whether that's 2453 // happened yet. 2454 bool AfterFormalArgLoads = false; 2455 2456 // Walk the register/memloc assignments, inserting copies/loads. In the case 2457 // of tail call optimization, arguments are handled later. 2458 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2459 i != e; 2460 ++i, ++realArgIdx) { 2461 CCValAssign &VA = ArgLocs[i]; 2462 SDValue Arg = OutVals[realArgIdx]; 2463 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2464 bool isByVal = Flags.isByVal(); 2465 2466 // Promote the value if needed. 
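    // (For example, an i8 or i16 argument assigned to a 32-bit location is
    // sign-, zero- or any-extended to i32 here, as recorded by the
    // calling-convention analysis.)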
2467     switch (VA.getLocInfo()) {
2468     default: llvm_unreachable("Unknown loc info!");
2469     case CCValAssign::Full: break;
2470     case CCValAssign::SExt:
2471       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2472       break;
2473     case CCValAssign::ZExt:
2474       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2475       break;
2476     case CCValAssign::AExt:
2477       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2478       break;
2479     case CCValAssign::BCvt:
2480       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2481       break;
2482     }
2483
2484     if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2485       Chain = DAG.getStackArgumentTokenFactor(Chain);
2486       AfterFormalArgLoads = true;
2487     }
2488
2489     // f16 arguments have their size extended to 4 bytes and passed as if they
2490     // had been copied to the LSBs of a 32-bit register.
2491     // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI).
2492     if (VA.needsCustom() &&
2493         (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2494       Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2495     } else {
2496       // f16 arguments could have been extended prior to argument lowering.
2497       // Mask the argument if this is a CMSE nonsecure call.
2498       auto ArgVT = Outs[realArgIdx].ArgVT;
2499       if (isCmseNSCall && (ArgVT == MVT::f16)) {
2500         auto LocBits = VA.getLocVT().getSizeInBits();
2501         auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2502         SDValue Mask =
2503             DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2504         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2505         Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2506         Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2507       }
2508     }
2509
2510     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2511     if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2512       SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2513                                 DAG.getConstant(0, dl, MVT::i32));
2514       SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2515                                 DAG.getConstant(1, dl, MVT::i32));
2516
2517       PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2518                        StackPtr, MemOpChains, isTailCall, SPDiff);
2519
2520       VA = ArgLocs[++i]; // skip ahead to next loc
2521       if (VA.isRegLoc()) {
2522         PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2523                          StackPtr, MemOpChains, isTailCall, SPDiff);
2524       } else {
2525         assert(VA.isMemLoc());
2526         SDValue DstAddr;
2527         MachinePointerInfo DstInfo;
2528         std::tie(DstAddr, DstInfo) =
2529             computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2530         MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2531       }
2532     } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2533       PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2534                        StackPtr, MemOpChains, isTailCall, SPDiff);
2535     } else if (VA.isRegLoc()) {
2536       if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2537           Outs[0].VT == MVT::i32) {
2538         assert(VA.getLocVT() == MVT::i32 &&
2539                "unexpected calling convention register assignment");
2540         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2541                "unexpected use of 'returned'");
2542         isThisReturn = true;
2543       }
2544       const TargetOptions &Options = DAG.getTarget().Options;
2545       if (Options.EmitCallSiteInfo)
2546         CSInfo.emplace_back(VA.getLocReg(), i);
2547       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2548     } else if (isByVal) {
2549       assert(VA.isMemLoc());
2550       unsigned offset = 0;
2551
2552       // True if this byval aggregate will be split between registers
2553       // and memory.
2554       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2555       unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2556
2557       if (CurByValIdx < ByValArgsCount) {
2558
2559         unsigned RegBegin, RegEnd;
2560         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2561
2562         EVT PtrVT =
2563             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
2564         unsigned int i, j;
2565         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2566           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2567           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2568           SDValue Load =
2569               DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2570                           DAG.InferPtrAlign(AddArg));
2571           MemOpChains.push_back(Load.getValue(1));
2572           RegsToPass.push_back(std::make_pair(j, Load));
2573         }
2574
2575         // If the parameter size exceeds the register area, "offset" lets us
2576         // compute the stack slot for the remaining part correctly.
2577         offset = RegEnd - RegBegin;
2578
2579         CCInfo.nextInRegsParam();
2580       }
2581
2582       if (Flags.getByValSize() > 4*offset) {
2583         auto PtrVT = getPointerTy(DAG.getDataLayout());
2584         SDValue Dst;
2585         MachinePointerInfo DstInfo;
2586         std::tie(Dst, DstInfo) =
2587             computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2588         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2589         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2590         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2591                                            MVT::i32);
2592         SDValue AlignNode =
2593             DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2594
2595         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2596         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2597         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2598                                           Ops));
2599       }
2600     } else {
2601       assert(VA.isMemLoc());
2602       SDValue DstAddr;
2603       MachinePointerInfo DstInfo;
2604       std::tie(DstAddr, DstInfo) =
2605           computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2606
2607       SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2608       MemOpChains.push_back(Store);
2609     }
2610   }
2611
2612   if (!MemOpChains.empty())
2613     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2614
2615   // Build a sequence of copy-to-reg nodes chained together with token chain
2616   // and flag operands which copy the outgoing args into the appropriate regs.
2617   SDValue InFlag;
2618   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2619     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2620                              RegsToPass[i].second, InFlag);
2621     InFlag = Chain.getValue(1);
2622   }
2623
2624   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2625   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2626   // node so that legalize doesn't hack it.
2627 bool isDirect = false; 2628 2629 const TargetMachine &TM = getTargetMachine(); 2630 const Module *Mod = MF.getFunction().getParent(); 2631 const GlobalValue *GV = nullptr; 2632 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2633 GV = G->getGlobal(); 2634 bool isStub = 2635 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2636 2637 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2638 bool isLocalARMFunc = false; 2639 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2640 2641 if (Subtarget->genLongCalls()) { 2642 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2643 "long-calls codegen is not position independent!"); 2644 // Handle a global address or an external symbol. If it's not one of 2645 // those, the target's already in a register, so we don't need to do 2646 // anything extra. 2647 if (isa<GlobalAddressSDNode>(Callee)) { 2648 // Create a constant pool entry for the callee address 2649 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2650 ARMConstantPoolValue *CPV = 2651 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2652 2653 // Get the address of the callee into a register 2654 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2655 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2656 Callee = DAG.getLoad( 2657 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2658 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2659 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2660 const char *Sym = S->getSymbol(); 2661 2662 // Create a constant pool entry for the callee address 2663 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2664 ARMConstantPoolValue *CPV = 2665 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2666 ARMPCLabelIndex, 0); 2667 // Get the address of the callee into a register 2668 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2669 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2670 Callee = DAG.getLoad( 2671 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2672 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2673 } 2674 } else if (isa<GlobalAddressSDNode>(Callee)) { 2675 if (!PreferIndirect) { 2676 isDirect = true; 2677 bool isDef = GV->isStrongDefinitionForLinker(); 2678 2679 // ARM call to a local ARM function is predicable. 2680 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2681 // tBX takes a register source operand. 
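      // On Thumb1 without v5T there is no BLX, so for a Mach-O stub the
      // callee address is materialized in a register (loaded via the non-lazy
      // pointer below) and the call goes through that register.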
2682 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2683 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2684 Callee = DAG.getNode( 2685 ARMISD::WrapperPIC, dl, PtrVt, 2686 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2687 Callee = DAG.getLoad( 2688 PtrVt, dl, DAG.getEntryNode(), Callee, 2689 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(), 2690 MachineMemOperand::MODereferenceable | 2691 MachineMemOperand::MOInvariant); 2692 } else if (Subtarget->isTargetCOFF()) { 2693 assert(Subtarget->isTargetWindows() && 2694 "Windows is the only supported COFF target"); 2695 unsigned TargetFlags = ARMII::MO_NO_FLAG; 2696 if (GV->hasDLLImportStorageClass()) 2697 TargetFlags = ARMII::MO_DLLIMPORT; 2698 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 2699 TargetFlags = ARMII::MO_COFFSTUB; 2700 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, 2701 TargetFlags); 2702 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 2703 Callee = 2704 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2705 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2706 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2707 } else { 2708 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2709 } 2710 } 2711 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2712 isDirect = true; 2713 // tBX takes a register source operand. 2714 const char *Sym = S->getSymbol(); 2715 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2716 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2717 ARMConstantPoolValue *CPV = 2718 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2719 ARMPCLabelIndex, 4); 2720 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2721 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2722 Callee = DAG.getLoad( 2723 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2724 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2725 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2726 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2727 } else { 2728 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2729 } 2730 } 2731 2732 if (isCmseNSCall) { 2733 assert(!isARMFunc && !isDirect && 2734 "Cannot handle call to ARM function or direct call"); 2735 if (NumBytes > 0) { 2736 DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(), 2737 "call to non-secure function would " 2738 "require passing arguments on stack", 2739 dl.getDebugLoc()); 2740 DAG.getContext()->diagnose(Diag); 2741 } 2742 if (isStructRet) { 2743 DiagnosticInfoUnsupported Diag( 2744 DAG.getMachineFunction().getFunction(), 2745 "call to non-secure function would return value through pointer", 2746 dl.getDebugLoc()); 2747 DAG.getContext()->diagnose(Diag); 2748 } 2749 } 2750 2751 // FIXME: handle tail calls differently. 
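  // Select the call opcode: the ARMISD::CALL family emits a BL/BLX-style call
  // that writes LR, while CALL_NOLINK is used when no link-register call is
  // available or desirable (e.g. pre-v5T indirect calls).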
2752 unsigned CallOpc; 2753 if (Subtarget->isThumb()) { 2754 if (GuardWithBTI) 2755 CallOpc = ARMISD::t2CALL_BTI; 2756 else if (isCmseNSCall) 2757 CallOpc = ARMISD::tSECALL; 2758 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2759 CallOpc = ARMISD::CALL_NOLINK; 2760 else 2761 CallOpc = ARMISD::CALL; 2762 } else { 2763 if (!isDirect && !Subtarget->hasV5TOps()) 2764 CallOpc = ARMISD::CALL_NOLINK; 2765 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2766 // Emit regular call when code size is the priority 2767 !Subtarget->hasMinSize()) 2768 // "mov lr, pc; b _foo" to avoid confusing the RSP 2769 CallOpc = ARMISD::CALL_NOLINK; 2770 else 2771 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2772 } 2773 2774 // We don't usually want to end the call-sequence here because we would tidy 2775 // the frame up *after* the call, however in the ABI-changing tail-call case 2776 // we've carefully laid out the parameters so that when sp is reset they'll be 2777 // in the correct location. 2778 if (isTailCall && !isSibCall) { 2779 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), 2780 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 2781 InFlag = Chain.getValue(1); 2782 } 2783 2784 std::vector<SDValue> Ops; 2785 Ops.push_back(Chain); 2786 Ops.push_back(Callee); 2787 2788 if (isTailCall) { 2789 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32)); 2790 } 2791 2792 // Add argument registers to the end of the list so that they are known live 2793 // into the call. 2794 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2795 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2796 RegsToPass[i].second.getValueType())); 2797 2798 // Add a register mask operand representing the call-preserved registers. 2799 const uint32_t *Mask; 2800 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 2801 if (isThisReturn) { 2802 // For 'this' returns, use the R0-preserving mask if applicable 2803 Mask = ARI->getThisReturnPreservedMask(MF, CallConv); 2804 if (!Mask) { 2805 // Set isThisReturn to false if the calling convention is not one that 2806 // allows 'returned' to be modeled in this way, so LowerCallResult does 2807 // not try to pass 'this' straight through 2808 isThisReturn = false; 2809 Mask = ARI->getCallPreservedMask(MF, CallConv); 2810 } 2811 } else 2812 Mask = ARI->getCallPreservedMask(MF, CallConv); 2813 2814 assert(Mask && "Missing call preserved mask for calling convention"); 2815 Ops.push_back(DAG.getRegisterMask(Mask)); 2816 2817 if (InFlag.getNode()) 2818 Ops.push_back(InFlag); 2819 2820 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2821 if (isTailCall) { 2822 MF.getFrameInfo().setHasTailCall(); 2823 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); 2824 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2825 return Ret; 2826 } 2827 2828 // Returns a chain and a flag for retval copy to use. 2829 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 2830 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); 2831 InFlag = Chain.getValue(1); 2832 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 2833 2834 // If we're guaranteeing tail-calls will be honoured, the callee must 2835 // pop its own argument stack on return. But this call is *not* a tail call so 2836 // we need to undo that after it returns to restore the status-quo. 2837 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; 2838 uint64_t CalleePopBytes = 2839 canGuaranteeTCO(CallConv, TailCallOpt) ? 
                                       alignTo(NumBytes, 16) : -1ULL;

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(CalleePopBytes, dl, true),
                             InFlag, dl);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, isThisReturn,
                         isThisReturn ? OutVals[0] : SDValue());
}

/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack. Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to ensure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    Align Alignment) const {
  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Alignment = std::max(Alignment, Align(4));

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  unsigned AlignInRegs = Alignment.value() / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  if (!Reg)
    return;

  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and the parameter is larger than all of the
  // remaining GPR registers. In that case we cannot split the parameter and
  // must send it entirely to the stack. We must also set the NCRN to R4, so
  // all remaining registers are wasted.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // The first register for the byval parameter is the first register that
  // was not allocated before this call, i.e. "Reg". If the parameter is
  // small enough to fit in the range [Reg, r4), the end (one past the last)
  // register is Reg + param-size-in-regs; otherwise the parameter is split
  // between registers and the stack, and the end register is r4.
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note that the first register was already allocated at the start of this
  // function; allocate the remaining registers we need here.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  Size = std::max<int>(Size - Excess, 0);
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
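/// For example (illustrative IR), an argument that arrives on the caller's
/// stack and is forwarded unchanged at the same offset needs no copy:
///   define i32 @caller(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
///     %r = tail call i32 @callee(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e)
///     ret i32 %r
///   }
/// Under AAPCS the fifth i32 is passed on the stack, so %e already sits in
/// the fixed stack slot the outgoing call expects.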
2911 static 2912 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2913 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2914 const TargetInstrInfo *TII) { 2915 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2916 int FI = std::numeric_limits<int>::max(); 2917 if (Arg.getOpcode() == ISD::CopyFromReg) { 2918 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2919 if (!Register::isVirtualRegister(VR)) 2920 return false; 2921 MachineInstr *Def = MRI->getVRegDef(VR); 2922 if (!Def) 2923 return false; 2924 if (!Flags.isByVal()) { 2925 if (!TII->isLoadFromStackSlot(*Def, FI)) 2926 return false; 2927 } else { 2928 return false; 2929 } 2930 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2931 if (Flags.isByVal()) 2932 // ByVal argument is passed in as a pointer but it's now being 2933 // dereferenced. e.g. 2934 // define @foo(%struct.X* %A) { 2935 // tail call @bar(%struct.X* byval %A) 2936 // } 2937 return false; 2938 SDValue Ptr = Ld->getBasePtr(); 2939 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2940 if (!FINode) 2941 return false; 2942 FI = FINode->getIndex(); 2943 } else 2944 return false; 2945 2946 assert(FI != std::numeric_limits<int>::max()); 2947 if (!MFI.isFixedObjectIndex(FI)) 2948 return false; 2949 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); 2950 } 2951 2952 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2953 /// for tail call optimization. Targets which want to do tail call 2954 /// optimization should implement this function. 2955 bool ARMTargetLowering::IsEligibleForTailCallOptimization( 2956 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2957 bool isCalleeStructRet, bool isCallerStructRet, 2958 const SmallVectorImpl<ISD::OutputArg> &Outs, 2959 const SmallVectorImpl<SDValue> &OutVals, 2960 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG, 2961 const bool isIndirect) const { 2962 MachineFunction &MF = DAG.getMachineFunction(); 2963 const Function &CallerF = MF.getFunction(); 2964 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2965 2966 assert(Subtarget->supportsTailCall()); 2967 2968 // Indirect tail calls cannot be optimized for Thumb1 if the args 2969 // to the call take up r0-r3. The reason is that there are no legal registers 2970 // left to hold the pointer to the function to be called. 2971 // Similarly, if the function uses return address sign and authentication, 2972 // r12 is needed to hold the PAC and is not available to hold the callee 2973 // address. 2974 if (Outs.size() >= 4 && 2975 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) { 2976 if (Subtarget->isThumb1Only()) 2977 return false; 2978 // Conservatively assume the function spills LR. 2979 if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)) 2980 return false; 2981 } 2982 2983 // Look for obvious safe cases to perform tail call optimization that do not 2984 // require ABI changes. This is what gcc calls sibcall. 2985 2986 // Exception-handling functions need a special set of instructions to indicate 2987 // a return to the hardware. Tail-calling another function would probably 2988 // break this. 2989 if (CallerF.hasFnAttribute("interrupt")) 2990 return false; 2991 2992 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) 2993 return CalleeCC == CallerCC; 2994 2995 // Also avoid sibcall optimization if either caller or callee uses struct 2996 // return semantics. 
2997 if (isCalleeStructRet || isCallerStructRet) 2998 return false; 2999 3000 // Externally-defined functions with weak linkage should not be 3001 // tail-called on ARM when the OS does not support dynamic 3002 // pre-emption of symbols, as the AAELF spec requires normal calls 3003 // to undefined weak functions to be replaced with a NOP or jump to the 3004 // next instruction. The behaviour of branch instructions in this 3005 // situation (as used for tail calls) is implementation-defined, so we 3006 // cannot rely on the linker replacing the tail call with a return. 3007 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 3008 const GlobalValue *GV = G->getGlobal(); 3009 const Triple &TT = getTargetMachine().getTargetTriple(); 3010 if (GV->hasExternalWeakLinkage() && 3011 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 3012 return false; 3013 } 3014 3015 // Check that the call results are passed in the same way. 3016 LLVMContext &C = *DAG.getContext(); 3017 if (!CCState::resultsCompatible( 3018 getEffectiveCallingConv(CalleeCC, isVarArg), 3019 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins, 3020 CCAssignFnForReturn(CalleeCC, isVarArg), 3021 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) 3022 return false; 3023 // The callee has to preserve all registers the caller needs to preserve. 3024 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3025 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 3026 if (CalleeCC != CallerCC) { 3027 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 3028 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 3029 return false; 3030 } 3031 3032 // If Caller's vararg or byval argument has been split between registers and 3033 // stack, do not perform tail call, since part of the argument is in caller's 3034 // local frame. 3035 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 3036 if (AFI_Caller->getArgRegsSaveSize()) 3037 return false; 3038 3039 // If the callee takes no arguments then go on to check the results of the 3040 // call. 3041 if (!Outs.empty()) { 3042 // Check if stack adjustment is needed. For now, do not do this if any 3043 // argument is passed on the stack. 3044 SmallVector<CCValAssign, 16> ArgLocs; 3045 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 3046 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 3047 if (CCInfo.getNextStackOffset()) { 3048 // Check if the arguments are already laid out in the right way as 3049 // the caller's fixed stack objects. 3050 MachineFrameInfo &MFI = MF.getFrameInfo(); 3051 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 3052 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 3053 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 3054 i != e; 3055 ++i, ++realArgIdx) { 3056 CCValAssign &VA = ArgLocs[i]; 3057 EVT RegVT = VA.getLocVT(); 3058 SDValue Arg = OutVals[realArgIdx]; 3059 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 3060 if (VA.getLocInfo() == CCValAssign::Indirect) 3061 return false; 3062 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) { 3063 // f64 and vector types are split into multiple registers or 3064 // register/stack-slot combinations. The types will not match 3065 // the registers; give up on memory f64 refs until we figure 3066 // out what to do about this. 
3067 if (!VA.isRegLoc()) 3068 return false; 3069 if (!ArgLocs[++i].isRegLoc()) 3070 return false; 3071 if (RegVT == MVT::v2f64) { 3072 if (!ArgLocs[++i].isRegLoc()) 3073 return false; 3074 if (!ArgLocs[++i].isRegLoc()) 3075 return false; 3076 } 3077 } else if (!VA.isRegLoc()) { 3078 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 3079 MFI, MRI, TII)) 3080 return false; 3081 } 3082 } 3083 } 3084 3085 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3086 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 3087 return false; 3088 } 3089 3090 return true; 3091 } 3092 3093 bool 3094 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 3095 MachineFunction &MF, bool isVarArg, 3096 const SmallVectorImpl<ISD::OutputArg> &Outs, 3097 LLVMContext &Context) const { 3098 SmallVector<CCValAssign, 16> RVLocs; 3099 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 3100 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 3101 } 3102 3103 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 3104 const SDLoc &DL, SelectionDAG &DAG) { 3105 const MachineFunction &MF = DAG.getMachineFunction(); 3106 const Function &F = MF.getFunction(); 3107 3108 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); 3109 3110 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 3111 // version of the "preferred return address". These offsets affect the return 3112 // instruction if this is a return from PL1 without hypervisor extensions. 3113 // IRQ/FIQ: +4 "subs pc, lr, #4" 3114 // SWI: 0 "subs pc, lr, #0" 3115 // ABORT: +4 "subs pc, lr, #4" 3116 // UNDEF: +4/+2 "subs pc, lr, #0" 3117 // UNDEF varies depending on where the exception came from ARM or Thumb 3118 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 3119 3120 int64_t LROffset; 3121 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 3122 IntKind == "ABORT") 3123 LROffset = 4; 3124 else if (IntKind == "SWI" || IntKind == "UNDEF") 3125 LROffset = 0; 3126 else 3127 report_fatal_error("Unsupported interrupt attribute. If present, value " 3128 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 3129 3130 RetOps.insert(RetOps.begin() + 1, 3131 DAG.getConstant(LROffset, DL, MVT::i32, false)); 3132 3133 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 3134 } 3135 3136 SDValue 3137 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3138 bool isVarArg, 3139 const SmallVectorImpl<ISD::OutputArg> &Outs, 3140 const SmallVectorImpl<SDValue> &OutVals, 3141 const SDLoc &dl, SelectionDAG &DAG) const { 3142 // CCValAssign - represent the assignment of the return value to a location. 3143 SmallVector<CCValAssign, 16> RVLocs; 3144 3145 // CCState - Info about the registers and stack slots. 3146 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 3147 *DAG.getContext()); 3148 3149 // Analyze outgoing return values. 3150 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 3151 3152 SDValue Flag; 3153 SmallVector<SDValue, 4> RetOps; 3154 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 3155 bool isLittleEndian = Subtarget->isLittle(); 3156 3157 MachineFunction &MF = DAG.getMachineFunction(); 3158 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3159 AFI->setReturnRegsCount(RVLocs.size()); 3160 3161 // Report error if cmse entry function returns structure through first ptr arg. 
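  // For example, an IR function carrying the "cmse_nonsecure_entry" attribute
  // whose aggregate return has been lowered to an sret pointer argument ends
  // up here; the CMSE entry ABI expects results to come back in registers, so
  // this cannot be supported.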
3162 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) { 3163 // Note: using an empty SDLoc(), as the first line of the function is a 3164 // better place to report than the last line. 3165 DiagnosticInfoUnsupported Diag( 3166 DAG.getMachineFunction().getFunction(), 3167 "secure entry function would return value through pointer", 3168 SDLoc().getDebugLoc()); 3169 DAG.getContext()->diagnose(Diag); 3170 } 3171 3172 // Copy the result values into the output registers. 3173 for (unsigned i = 0, realRVLocIdx = 0; 3174 i != RVLocs.size(); 3175 ++i, ++realRVLocIdx) { 3176 CCValAssign &VA = RVLocs[i]; 3177 assert(VA.isRegLoc() && "Can only return in registers!"); 3178 3179 SDValue Arg = OutVals[realRVLocIdx]; 3180 bool ReturnF16 = false; 3181 3182 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { 3183 // Half-precision return values can be returned like this: 3184 // 3185 // t11 f16 = fadd ... 3186 // t12: i16 = bitcast t11 3187 // t13: i32 = zero_extend t12 3188 // t14: f32 = bitcast t13 <~~~~~~~ Arg 3189 // 3190 // to avoid code generation for bitcasts, we simply set Arg to the node 3191 // that produces the f16 value, t11 in this case. 3192 // 3193 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { 3194 SDValue ZE = Arg.getOperand(0); 3195 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { 3196 SDValue BC = ZE.getOperand(0); 3197 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { 3198 Arg = BC.getOperand(0); 3199 ReturnF16 = true; 3200 } 3201 } 3202 } 3203 } 3204 3205 switch (VA.getLocInfo()) { 3206 default: llvm_unreachable("Unknown loc info!"); 3207 case CCValAssign::Full: break; 3208 case CCValAssign::BCvt: 3209 if (!ReturnF16) 3210 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 3211 break; 3212 } 3213 3214 // Mask f16 arguments if this is a CMSE nonsecure entry. 3215 auto RetVT = Outs[realRVLocIdx].ArgVT; 3216 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) { 3217 if (VA.needsCustom() && VA.getValVT() == MVT::f16) { 3218 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); 3219 } else { 3220 auto LocBits = VA.getLocVT().getSizeInBits(); 3221 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits()); 3222 SDValue Mask = 3223 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); 3224 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); 3225 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); 3226 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 3227 } 3228 } 3229 3230 if (VA.needsCustom() && 3231 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) { 3232 if (VA.getLocVT() == MVT::v2f64) { 3233 // Extract the first half and return it in two registers. 3234 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 3235 DAG.getConstant(0, dl, MVT::i32)); 3236 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 3237 DAG.getVTList(MVT::i32, MVT::i32), Half); 3238 3239 Chain = 3240 DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3241 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag); 3242 Flag = Chain.getValue(1); 3243 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3244 VA = RVLocs[++i]; // skip ahead to next loc 3245 Chain = 3246 DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3247 HalfGPRs.getValue(isLittleEndian ? 
1 : 0), Flag); 3248 Flag = Chain.getValue(1); 3249 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3250 VA = RVLocs[++i]; // skip ahead to next loc 3251 3252 // Extract the 2nd half and fall through to handle it as an f64 value. 3253 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 3254 DAG.getConstant(1, dl, MVT::i32)); 3255 } 3256 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 3257 // available. 3258 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 3259 DAG.getVTList(MVT::i32, MVT::i32), Arg); 3260 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3261 fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); 3262 Flag = Chain.getValue(1); 3263 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3264 VA = RVLocs[++i]; // skip ahead to next loc 3265 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3266 fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); 3267 } else 3268 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 3269 3270 // Guarantee that all emitted copies are 3271 // stuck together, avoiding something bad. 3272 Flag = Chain.getValue(1); 3273 RetOps.push_back(DAG.getRegister( 3274 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT())); 3275 } 3276 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3277 const MCPhysReg *I = 3278 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 3279 if (I) { 3280 for (; *I; ++I) { 3281 if (ARM::GPRRegClass.contains(*I)) 3282 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 3283 else if (ARM::DPRRegClass.contains(*I)) 3284 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 3285 else 3286 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 3287 } 3288 } 3289 3290 // Update chain and glue. 3291 RetOps[0] = Chain; 3292 if (Flag.getNode()) 3293 RetOps.push_back(Flag); 3294 3295 // CPUs which aren't M-class use a special sequence to return from 3296 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 3297 // though we use "subs pc, lr, #N"). 3298 // 3299 // M-class CPUs actually use a normal return sequence with a special 3300 // (hardware-provided) value in LR, so the normal code path works. 3301 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && 3302 !Subtarget->isMClass()) { 3303 if (Subtarget->isThumb1Only()) 3304 report_fatal_error("interrupt attribute is not supported in Thumb1"); 3305 return LowerInterruptReturn(RetOps, dl, DAG); 3306 } 3307 3308 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG : 3309 ARMISD::RET_FLAG; 3310 return DAG.getNode(RetNode, dl, MVT::Other, RetOps); 3311 } 3312 3313 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 3314 if (N->getNumValues() != 1) 3315 return false; 3316 if (!N->hasNUsesOfValue(1, 0)) 3317 return false; 3318 3319 SDValue TCChain = Chain; 3320 SDNode *Copy = *N->use_begin(); 3321 if (Copy->getOpcode() == ISD::CopyToReg) { 3322 // If the copy has a glue operand, we conservatively assume it isn't safe to 3323 // perform a tail call. 3324 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 3325 return false; 3326 TCChain = Copy->getOperand(0); 3327 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 3328 SDNode *VMov = Copy; 3329 // f64 returned in a pair of GPRs. 
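    // A VMOVRRD that extracts the two i32 halves should feed at most two
    // CopyToReg nodes (one per GPR); anything else means this is not a plain
    // f64 return, and the code below conservatively bails out.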
3330 SmallPtrSet<SDNode*, 2> Copies; 3331 for (SDNode *U : VMov->uses()) { 3332 if (U->getOpcode() != ISD::CopyToReg) 3333 return false; 3334 Copies.insert(U); 3335 } 3336 if (Copies.size() > 2) 3337 return false; 3338 3339 for (SDNode *U : VMov->uses()) { 3340 SDValue UseChain = U->getOperand(0); 3341 if (Copies.count(UseChain.getNode())) 3342 // Second CopyToReg 3343 Copy = U; 3344 else { 3345 // We are at the top of this chain. 3346 // If the copy has a glue operand, we conservatively assume it 3347 // isn't safe to perform a tail call. 3348 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue) 3349 return false; 3350 // First CopyToReg 3351 TCChain = UseChain; 3352 } 3353 } 3354 } else if (Copy->getOpcode() == ISD::BITCAST) { 3355 // f32 returned in a single GPR. 3356 if (!Copy->hasOneUse()) 3357 return false; 3358 Copy = *Copy->use_begin(); 3359 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 3360 return false; 3361 // If the copy has a glue operand, we conservatively assume it isn't safe to 3362 // perform a tail call. 3363 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 3364 return false; 3365 TCChain = Copy->getOperand(0); 3366 } else { 3367 return false; 3368 } 3369 3370 bool HasRet = false; 3371 for (const SDNode *U : Copy->uses()) { 3372 if (U->getOpcode() != ARMISD::RET_FLAG && 3373 U->getOpcode() != ARMISD::INTRET_FLAG) 3374 return false; 3375 HasRet = true; 3376 } 3377 3378 if (!HasRet) 3379 return false; 3380 3381 Chain = TCChain; 3382 return true; 3383 } 3384 3385 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 3386 if (!Subtarget->supportsTailCall()) 3387 return false; 3388 3389 if (!CI->isTailCall()) 3390 return false; 3391 3392 return true; 3393 } 3394 3395 // Trying to write a 64 bit value so need to split into two 32 bit values first, 3396 // and pass the lower and high parts through. 3397 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { 3398 SDLoc DL(Op); 3399 SDValue WriteValue = Op->getOperand(2); 3400 3401 // This function is only supposed to be called for i64 type argument. 3402 assert(WriteValue.getValueType() == MVT::i64 3403 && "LowerWRITE_REGISTER called for non-i64 type argument."); 3404 3405 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 3406 DAG.getConstant(0, DL, MVT::i32)); 3407 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 3408 DAG.getConstant(1, DL, MVT::i32)); 3409 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; 3410 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); 3411 } 3412 3413 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 3414 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 3415 // one of the above mentioned nodes. It has to be wrapped because otherwise 3416 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 3417 // be used to form addressing mode. These wrapped nodes will be selected 3418 // into MOVi. 3419 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, 3420 SelectionDAG &DAG) const { 3421 EVT PtrVT = Op.getValueType(); 3422 // FIXME there is no actual debug info here 3423 SDLoc dl(Op); 3424 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 3425 SDValue Res; 3426 3427 // When generating execute-only code Constant Pools must be promoted to the 3428 // global data section. 
It's a bit ugly that we can't share them across basic 3429 // blocks, but this way we guarantee that execute-only behaves correct with 3430 // position-independent addressing modes. 3431 if (Subtarget->genExecuteOnly()) { 3432 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); 3433 auto T = const_cast<Type*>(CP->getType()); 3434 auto C = const_cast<Constant*>(CP->getConstVal()); 3435 auto M = const_cast<Module*>(DAG.getMachineFunction(). 3436 getFunction().getParent()); 3437 auto GV = new GlobalVariable( 3438 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, 3439 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + 3440 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + 3441 Twine(AFI->createPICLabelUId()) 3442 ); 3443 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), 3444 dl, PtrVT); 3445 return LowerGlobalAddress(GA, DAG); 3446 } 3447 3448 if (CP->isMachineConstantPoolEntry()) 3449 Res = 3450 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign()); 3451 else 3452 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign()); 3453 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 3454 } 3455 3456 unsigned ARMTargetLowering::getJumpTableEncoding() const { 3457 return MachineJumpTableInfo::EK_Inline; 3458 } 3459 3460 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 3461 SelectionDAG &DAG) const { 3462 MachineFunction &MF = DAG.getMachineFunction(); 3463 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3464 unsigned ARMPCLabelIndex = 0; 3465 SDLoc DL(Op); 3466 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3467 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 3468 SDValue CPAddr; 3469 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); 3470 if (!IsPositionIndependent) { 3471 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4)); 3472 } else { 3473 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; 3474 ARMPCLabelIndex = AFI->createPICLabelUId(); 3475 ARMConstantPoolValue *CPV = 3476 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 3477 ARMCP::CPBlockAddress, PCAdj); 3478 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3479 } 3480 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 3481 SDValue Result = DAG.getLoad( 3482 PtrVT, DL, DAG.getEntryNode(), CPAddr, 3483 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3484 if (!IsPositionIndependent) 3485 return Result; 3486 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 3487 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 3488 } 3489 3490 /// Convert a TLS address reference into the correct sequence of loads 3491 /// and calls to compute the variable's address for Darwin, and return an 3492 /// SDValue containing the final node. 3493 3494 /// Darwin only has one TLS scheme which must be capable of dealing with the 3495 /// fully general situation, in the worst case. This means: 3496 /// + "extern __thread" declaration. 3497 /// + Defined in a possibly unknown dynamic library. 3498 /// 3499 /// The general system is that each __thread variable has a [3 x i32] descriptor 3500 /// which contains information used by the runtime to calculate the address. The 3501 /// only part of this the compiler needs to know about is the first word, which 3502 /// contains a function pointer that must be called with the address of the 3503 /// entire descriptor in "r0". 
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // The first step is to get the address of the actual global symbol. This
  // is where the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = FuncTLVGet.getValue(1);

  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not
  // be silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and the
  // call returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}

SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block).
  SDValue Ops[] = {Chain,
                   DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getTargetConstant(15, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(13, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(2, DL, MVT::i32)};
  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB.
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
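  // The overall address computation performed below is, roughly:
  //   var = *(*(TEB + 0x2c) + 4 * _tls_index) + secrel32(var)
  // i.e. index the per-module TLS array with _tls_index and then add the
  // variable's offset within this module's .tls section.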
3583 SDValue TLSArray = 3584 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 3585 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 3586 3587 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 3588 // offset into the TLSArray. 3589 3590 // Load the TLS index from the C runtime 3591 SDValue TLSIndex = 3592 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 3593 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 3594 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 3595 3596 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 3597 DAG.getConstant(2, DL, MVT::i32)); 3598 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 3599 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 3600 MachinePointerInfo()); 3601 3602 // Get the offset of the start of the .tls section (section base) 3603 const auto *GA = cast<GlobalAddressSDNode>(Op); 3604 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 3605 SDValue Offset = DAG.getLoad( 3606 PtrVT, DL, Chain, 3607 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 3608 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))), 3609 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3610 3611 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 3612 } 3613 3614 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 3615 SDValue 3616 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 3617 SelectionDAG &DAG) const { 3618 SDLoc dl(GA); 3619 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3620 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3621 MachineFunction &MF = DAG.getMachineFunction(); 3622 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3623 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3624 ARMConstantPoolValue *CPV = 3625 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3626 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 3627 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3628 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 3629 Argument = DAG.getLoad( 3630 PtrVT, dl, DAG.getEntryNode(), Argument, 3631 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3632 SDValue Chain = Argument.getValue(1); 3633 3634 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3635 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 3636 3637 // call __tls_get_addr. 3638 ArgListTy Args; 3639 ArgListEntry Entry; 3640 Entry.Node = Argument; 3641 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 3642 Args.push_back(Entry); 3643 3644 // FIXME: is there useful debug info available here? 3645 TargetLowering::CallLoweringInfo CLI(DAG); 3646 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 3647 CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 3648 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 3649 3650 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 3651 return CallResult.first; 3652 } 3653 3654 // Lower ISD::GlobalTLSAddress using the "initial exec" or 3655 // "local exec" model. 
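// In both models the variable's address is the thread pointer plus a constant
// offset. Initial exec loads that offset indirectly through a GOT entry
// (GOTTPOFF), so it also works for variables defined in other modules that
// are loaded at program start-up; local exec folds the offset (TPOFF)
// directly into a literal and is only valid for TLS data in the executable
// itself.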
3656 SDValue 3657 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 3658 SelectionDAG &DAG, 3659 TLSModel::Model model) const { 3660 const GlobalValue *GV = GA->getGlobal(); 3661 SDLoc dl(GA); 3662 SDValue Offset; 3663 SDValue Chain = DAG.getEntryNode(); 3664 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3665 // Get the Thread Pointer 3666 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3667 3668 if (model == TLSModel::InitialExec) { 3669 MachineFunction &MF = DAG.getMachineFunction(); 3670 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3671 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3672 // Initial exec model. 3673 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3674 ARMConstantPoolValue *CPV = 3675 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3676 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 3677 true); 3678 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3679 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3680 Offset = DAG.getLoad( 3681 PtrVT, dl, Chain, Offset, 3682 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3683 Chain = Offset.getValue(1); 3684 3685 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3686 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 3687 3688 Offset = DAG.getLoad( 3689 PtrVT, dl, Chain, Offset, 3690 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3691 } else { 3692 // local exec model 3693 assert(model == TLSModel::LocalExec); 3694 ARMConstantPoolValue *CPV = 3695 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 3696 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3697 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3698 Offset = DAG.getLoad( 3699 PtrVT, dl, Chain, Offset, 3700 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3701 } 3702 3703 // The address of the thread local variable is the add of the thread 3704 // pointer with the offset of the variable. 3705 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 3706 } 3707 3708 SDValue 3709 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 3710 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3711 if (DAG.getTarget().useEmulatedTLS()) 3712 return LowerToTLSEmulatedModel(GA, DAG); 3713 3714 if (Subtarget->isTargetDarwin()) 3715 return LowerGlobalTLSAddressDarwin(Op, DAG); 3716 3717 if (Subtarget->isTargetWindows()) 3718 return LowerGlobalTLSAddressWindows(Op, DAG); 3719 3720 // TODO: implement the "local dynamic" model 3721 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 3722 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 3723 3724 switch (model) { 3725 case TLSModel::GeneralDynamic: 3726 case TLSModel::LocalDynamic: 3727 return LowerToTLSGeneralDynamicModel(GA, DAG); 3728 case TLSModel::InitialExec: 3729 case TLSModel::LocalExec: 3730 return LowerToTLSExecModels(GA, DAG, model); 3731 } 3732 llvm_unreachable("bogus TLS model"); 3733 } 3734 3735 /// Return true if all users of V are within function F, looking through 3736 /// ConstantExprs. 
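/// For example, a private constant string referenced only through
/// getelementptr ConstantExprs inside F counts as used solely within F,
/// whereas a use from another function or from a global initializer does not.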
static bool allUsersAreInFunction(const Value *V, const Function *F) {
  SmallVector<const User*,4> Worklist(V->users());
  while (!Worklist.empty()) {
    auto *U = Worklist.pop_back_val();
    if (isa<ConstantExpr>(U)) {
      append_range(Worklist, U->users());
      continue;
    }

    auto *I = dyn_cast<Instruction>(U);
    if (!I || I->getParent()->getParent() != F)
      return false;
  }
  return true;
}

static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
                                     const GlobalValue *GV, SelectionDAG &DAG,
                                     EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed
  // address, and the global is small enough, we can emit it inline into the
  // constant pool to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled;
  // otherwise we could decide to inline here (and thus never emit the GV)
  // while fast-isel-generated code still requires the GV.
  if (!EnableConstpoolPromotion ||
      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
    return SDValue();

  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // If we inline a value that contains relocations, we move the relocations
  // from .data to .text. This is not allowed in position-independent code.
  auto *Init = GVar->getInitializer();
  if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
      Init->needsDynamicRelocation())
    return SDValue();

  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type wanting greater alignment requirements than 4 bytes. We also
  // can only promote constants that are multiples of 4 bytes in size or
  // are paddable to a multiple of 4. Currently we only try to pad constants
  // that are strings for simplicity.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
  unsigned RequiredPadding = 4 - (Size % 4);
  bool PaddingPossible =
      RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge.
If we haven't promoted this global yet (it may have 3810 // multiple uses), and promoting it would increase the constant pool size (Sz 3811 // > 4), ensure we have space to do so up to MaxTotal. 3812 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 3813 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 3814 ConstpoolPromotionMaxTotal) 3815 return SDValue(); 3816 3817 // This is only valid if all users are in a single function; we can't clone 3818 // the constant in general. The LLVM IR unnamed_addr allows merging 3819 // constants, but not cloning them. 3820 // 3821 // We could potentially allow cloning if we could prove all uses of the 3822 // constant in the current function don't care about the address, like 3823 // printf format strings. But that isn't implemented for now. 3824 if (!allUsersAreInFunction(GVar, &F)) 3825 return SDValue(); 3826 3827 // We're going to inline this global. Pad it out if needed. 3828 if (RequiredPadding != 4) { 3829 StringRef S = CDAInit->getAsString(); 3830 3831 SmallVector<uint8_t,16> V(S.size()); 3832 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3833 while (RequiredPadding--) 3834 V.push_back(0); 3835 Init = ConstantDataArray::get(*DAG.getContext(), V); 3836 } 3837 3838 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3839 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4)); 3840 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3841 AFI->markGlobalAsPromotedToConstantPool(GVar); 3842 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3843 PaddedSize - 4); 3844 } 3845 ++NumConstpoolPromoted; 3846 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3847 } 3848 3849 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3850 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3851 if (!(GV = GA->getAliaseeObject())) 3852 return false; 3853 if (const auto *V = dyn_cast<GlobalVariable>(GV)) 3854 return V->isConstant(); 3855 return isa<Function>(GV); 3856 } 3857 3858 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3859 SelectionDAG &DAG) const { 3860 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3861 default: llvm_unreachable("unknown object format"); 3862 case Triple::COFF: 3863 return LowerGlobalAddressWindows(Op, DAG); 3864 case Triple::ELF: 3865 return LowerGlobalAddressELF(Op, DAG); 3866 case Triple::MachO: 3867 return LowerGlobalAddressDarwin(Op, DAG); 3868 } 3869 } 3870 3871 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3872 SelectionDAG &DAG) const { 3873 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3874 SDLoc dl(Op); 3875 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3876 const TargetMachine &TM = getTargetMachine(); 3877 bool IsRO = isReadOnly(GV); 3878 3879 // promoteToConstantPool only if not generating XO text section 3880 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) 3881 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) 3882 return V; 3883 3884 if (isPositionIndependent()) { 3885 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3886 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3887 UseGOT_PREL ? 
ARMII::MO_GOT : 0); 3888 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3889 if (UseGOT_PREL) 3890 Result = 3891 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3892 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3893 return Result; 3894 } else if (Subtarget->isROPI() && IsRO) { 3895 // PC-relative. 3896 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3897 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3898 return Result; 3899 } else if (Subtarget->isRWPI() && !IsRO) { 3900 // SB-relative. 3901 SDValue RelAddr; 3902 if (Subtarget->useMovt()) { 3903 ++NumMovwMovt; 3904 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3905 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3906 } else { // use literal pool for address constant 3907 ARMConstantPoolValue *CPV = 3908 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3909 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3910 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3911 RelAddr = DAG.getLoad( 3912 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3913 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3914 } 3915 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3916 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3917 return Result; 3918 } 3919 3920 // If we have T2 ops, we can materialize the address directly via movt/movw 3921 // pair. This is always cheaper. 3922 if (Subtarget->useMovt()) { 3923 ++NumMovwMovt; 3924 // FIXME: Once remat is capable of dealing with instructions with register 3925 // operands, expand this into two nodes. 3926 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3927 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3928 } else { 3929 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4)); 3930 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3931 return DAG.getLoad( 3932 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3933 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3934 } 3935 } 3936 3937 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3938 SelectionDAG &DAG) const { 3939 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3940 "ROPI/RWPI not currently supported for Darwin"); 3941 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3942 SDLoc dl(Op); 3943 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3944 3945 if (Subtarget->useMovt()) 3946 ++NumMovwMovt; 3947 3948 // FIXME: Once remat is capable of dealing with instructions with register 3949 // operands, expand this into multiple nodes 3950 unsigned Wrapper = 3951 isPositionIndependent() ? 
ARMISD::WrapperPIC : ARMISD::Wrapper; 3952 3953 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3954 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3955 3956 if (Subtarget->isGVIndirectSymbol(GV)) 3957 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3958 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3959 return Result; 3960 } 3961 3962 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3963 SelectionDAG &DAG) const { 3964 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3965 assert(Subtarget->useMovt() && 3966 "Windows on ARM expects to use movw/movt"); 3967 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3968 "ROPI/RWPI not currently supported for Windows"); 3969 3970 const TargetMachine &TM = getTargetMachine(); 3971 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3972 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; 3973 if (GV->hasDLLImportStorageClass()) 3974 TargetFlags = ARMII::MO_DLLIMPORT; 3975 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 3976 TargetFlags = ARMII::MO_COFFSTUB; 3977 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3978 SDValue Result; 3979 SDLoc DL(Op); 3980 3981 ++NumMovwMovt; 3982 3983 // FIXME: Once remat is capable of dealing with instructions with register 3984 // operands, expand this into two nodes. 3985 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3986 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, 3987 TargetFlags)); 3988 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 3989 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3990 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3991 return Result; 3992 } 3993 3994 SDValue 3995 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3996 SDLoc dl(Op); 3997 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3998 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3999 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 4000 Op.getOperand(1), Val); 4001 } 4002 4003 SDValue 4004 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 4005 SDLoc dl(Op); 4006 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 4007 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 4008 } 4009 4010 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 4011 SelectionDAG &DAG) const { 4012 SDLoc dl(Op); 4013 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 4014 Op.getOperand(0)); 4015 } 4016 4017 SDValue ARMTargetLowering::LowerINTRINSIC_VOID( 4018 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { 4019 unsigned IntNo = 4020 cast<ConstantSDNode>( 4021 Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) 4022 ->getZExtValue(); 4023 switch (IntNo) { 4024 default: 4025 return SDValue(); // Don't custom lower most intrinsics. 4026 case Intrinsic::arm_gnu_eabi_mcount: { 4027 MachineFunction &MF = DAG.getMachineFunction(); 4028 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4029 SDLoc dl(Op); 4030 SDValue Chain = Op.getOperand(0); 4031 // call "\01__gnu_mcount_nc" 4032 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 4033 const uint32_t *Mask = 4034 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); 4035 assert(Mask && "Missing call preserved mask for calling convention"); 4036 // Mark LR an implicit live-in. 
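    // __gnu_mcount_nc expects the caller's return address to be passed on the
    // stack; the BL_PUSHLR / tBL_PUSHLR pseudos emitted below push LR before
    // branching, so the incoming LR value has to be made available here.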
4037 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 4038 SDValue ReturnAddress = 4039 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); 4040 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue}; 4041 SDValue Callee = 4042 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); 4043 SDValue RegisterMask = DAG.getRegisterMask(Mask); 4044 if (Subtarget->isThumb()) 4045 return SDValue( 4046 DAG.getMachineNode( 4047 ARM::tBL_PUSHLR, dl, ResultTys, 4048 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), 4049 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), 4050 0); 4051 return SDValue( 4052 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, 4053 {ReturnAddress, Callee, RegisterMask, Chain}), 4054 0); 4055 } 4056 } 4057 } 4058 4059 SDValue 4060 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 4061 const ARMSubtarget *Subtarget) const { 4062 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4063 SDLoc dl(Op); 4064 switch (IntNo) { 4065 default: return SDValue(); // Don't custom lower most intrinsics. 4066 case Intrinsic::thread_pointer: { 4067 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4068 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 4069 } 4070 case Intrinsic::arm_cls: { 4071 const SDValue &Operand = Op.getOperand(1); 4072 const EVT VTy = Op.getValueType(); 4073 SDValue SRA = 4074 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy)); 4075 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand); 4076 SDValue SHL = 4077 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy)); 4078 SDValue OR = 4079 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy)); 4080 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR); 4081 return Result; 4082 } 4083 case Intrinsic::arm_cls64: { 4084 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x)) 4085 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x))) 4086 const SDValue &Operand = Op.getOperand(1); 4087 const EVT VTy = Op.getValueType(); 4088 4089 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 4090 DAG.getConstant(1, dl, VTy)); 4091 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 4092 DAG.getConstant(0, dl, VTy)); 4093 SDValue Constant0 = DAG.getConstant(0, dl, VTy); 4094 SDValue Constant1 = DAG.getConstant(1, dl, VTy); 4095 SDValue Constant31 = DAG.getConstant(31, dl, VTy); 4096 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31); 4097 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi); 4098 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1); 4099 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1); 4100 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi); 4101 SDValue CheckLo = 4102 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ); 4103 SDValue HiIsZero = 4104 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ); 4105 SDValue AdjustedLo = 4106 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy)); 4107 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo); 4108 SDValue Result = 4109 DAG.getSelect(dl, VTy, CheckLo, 4110 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi); 4111 return Result; 4112 } 4113 case Intrinsic::eh_sjlj_lsda: { 4114 MachineFunction &MF = DAG.getMachineFunction(); 4115 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4116 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 4117 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4118 SDValue CPAddr; 4119 
bool IsPositionIndependent = isPositionIndependent(); 4120 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 4121 ARMConstantPoolValue *CPV = 4122 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 4123 ARMCP::CPLSDA, PCAdj); 4124 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 4125 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 4126 SDValue Result = DAG.getLoad( 4127 PtrVT, dl, DAG.getEntryNode(), CPAddr, 4128 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 4129 4130 if (IsPositionIndependent) { 4131 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 4132 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 4133 } 4134 return Result; 4135 } 4136 case Intrinsic::arm_neon_vabs: 4137 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 4138 Op.getOperand(1)); 4139 case Intrinsic::arm_neon_vmulls: 4140 case Intrinsic::arm_neon_vmullu: { 4141 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 4142 ? ARMISD::VMULLs : ARMISD::VMULLu; 4143 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4144 Op.getOperand(1), Op.getOperand(2)); 4145 } 4146 case Intrinsic::arm_neon_vminnm: 4147 case Intrinsic::arm_neon_vmaxnm: { 4148 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 4149 ? ISD::FMINNUM : ISD::FMAXNUM; 4150 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4151 Op.getOperand(1), Op.getOperand(2)); 4152 } 4153 case Intrinsic::arm_neon_vminu: 4154 case Intrinsic::arm_neon_vmaxu: { 4155 if (Op.getValueType().isFloatingPoint()) 4156 return SDValue(); 4157 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 4158 ? ISD::UMIN : ISD::UMAX; 4159 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4160 Op.getOperand(1), Op.getOperand(2)); 4161 } 4162 case Intrinsic::arm_neon_vmins: 4163 case Intrinsic::arm_neon_vmaxs: { 4164 // v{min,max}s is overloaded between signed integers and floats. 4165 if (!Op.getValueType().isFloatingPoint()) { 4166 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 4167 ? ISD::SMIN : ISD::SMAX; 4168 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4169 Op.getOperand(1), Op.getOperand(2)); 4170 } 4171 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 4172 ? 
ISD::FMINIMUM : ISD::FMAXIMUM; 4173 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 4174 Op.getOperand(1), Op.getOperand(2)); 4175 } 4176 case Intrinsic::arm_neon_vtbl1: 4177 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 4178 Op.getOperand(1), Op.getOperand(2)); 4179 case Intrinsic::arm_neon_vtbl2: 4180 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 4181 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4182 case Intrinsic::arm_mve_pred_i2v: 4183 case Intrinsic::arm_mve_pred_v2i: 4184 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), 4185 Op.getOperand(1)); 4186 case Intrinsic::arm_mve_vreinterpretq: 4187 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(), 4188 Op.getOperand(1)); 4189 case Intrinsic::arm_mve_lsll: 4190 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(), 4191 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4192 case Intrinsic::arm_mve_asrl: 4193 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(), 4194 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4195 } 4196 } 4197 4198 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 4199 const ARMSubtarget *Subtarget) { 4200 SDLoc dl(Op); 4201 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); 4202 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); 4203 if (SSID == SyncScope::SingleThread) 4204 return Op; 4205 4206 if (!Subtarget->hasDataBarrier()) { 4207 // Some ARMv6 cpus can support data barriers with an mcr instruction. 4208 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 4209 // here. 4210 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 4211 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 4212 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 4213 DAG.getConstant(0, dl, MVT::i32)); 4214 } 4215 4216 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 4217 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 4218 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 4219 if (Subtarget->isMClass()) { 4220 // Only a full system barrier exists in the M-class architectures. 4221 Domain = ARM_MB::SY; 4222 } else if (Subtarget->preferISHSTBarriers() && 4223 Ord == AtomicOrdering::Release) { 4224 // Swift happens to implement ISHST barriers in a way that's compatible with 4225 // Release semantics but weaker than ISH so we'd be fools not to use 4226 // it. Beware: other processors probably don't! 4227 Domain = ARM_MB::ISHST; 4228 } 4229 4230 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 4231 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 4232 DAG.getConstant(Domain, dl, MVT::i32)); 4233 } 4234 4235 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 4236 const ARMSubtarget *Subtarget) { 4237 // ARM pre v5TE and Thumb1 does not have preload instructions. 4238 if (!(Subtarget->isThumb2() || 4239 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 4240 // Just preserve the chain. 4241 return Op.getOperand(0); 4242 4243 SDLoc dl(Op); 4244 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 4245 if (!isRead && 4246 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 4247 // ARMv7 with MP extension has PLDW. 4248 return Op.getOperand(0); 4249 4250 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 4251 if (Subtarget->isThumb()) { 4252 // Invert the bits. 
4253 isRead = ~isRead & 1; 4254 isData = ~isData & 1; 4255 } 4256 4257 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 4258 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 4259 DAG.getConstant(isData, dl, MVT::i32)); 4260 } 4261 4262 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 4263 MachineFunction &MF = DAG.getMachineFunction(); 4264 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 4265 4266 // vastart just stores the address of the VarArgsFrameIndex slot into the 4267 // memory location argument. 4268 SDLoc dl(Op); 4269 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 4270 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 4271 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4272 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 4273 MachinePointerInfo(SV)); 4274 } 4275 4276 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, 4277 CCValAssign &NextVA, 4278 SDValue &Root, 4279 SelectionDAG &DAG, 4280 const SDLoc &dl) const { 4281 MachineFunction &MF = DAG.getMachineFunction(); 4282 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4283 4284 const TargetRegisterClass *RC; 4285 if (AFI->isThumb1OnlyFunction()) 4286 RC = &ARM::tGPRRegClass; 4287 else 4288 RC = &ARM::GPRRegClass; 4289 4290 // Transform the arguments stored in physical registers into virtual ones. 4291 Register Reg = MF.addLiveIn(VA.getLocReg(), RC); 4292 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 4293 4294 SDValue ArgValue2; 4295 if (NextVA.isMemLoc()) { 4296 MachineFrameInfo &MFI = MF.getFrameInfo(); 4297 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); 4298 4299 // Create load node to retrieve arguments from the stack. 4300 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 4301 ArgValue2 = DAG.getLoad( 4302 MVT::i32, dl, Root, FIN, 4303 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 4304 } else { 4305 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 4306 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 4307 } 4308 if (!Subtarget->isLittle()) 4309 std::swap (ArgValue, ArgValue2); 4310 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 4311 } 4312 4313 // The remaining GPRs hold either the beginning of variable-argument 4314 // data, or the beginning of an aggregate passed by value (usually 4315 // byval). Either way, we allocate stack slots adjacent to the data 4316 // provided by our caller, and store the unallocated registers there. 4317 // If this is a variadic function, the va_list pointer will begin with 4318 // these values; otherwise, this reassembles a (byval) structure that 4319 // was split between registers and memory. 4320 // Return: The frame index registers were stored into. 4321 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 4322 const SDLoc &dl, SDValue &Chain, 4323 const Value *OrigArg, 4324 unsigned InRegsParamRecordIdx, 4325 int ArgOffset, unsigned ArgSize) const { 4326 // Currently, two use-cases possible: 4327 // Case #1. Non-var-args function, and we meet first byval parameter. 4328 // Setup first unallocated register as first byval register; 4329 // eat all remained registers 4330 // (these two actions are performed by HandleByVal method). 4331 // Then, here, we initialize stack frame with 4332 // "store-reg" instructions. 4333 // Case #2. Var-args function, that doesn't contain byval parameters. 
4334 // The same: eat all remaining unallocated registers,
4335 // initialize the stack frame.
4336
4337 MachineFunction &MF = DAG.getMachineFunction();
4338 MachineFrameInfo &MFI = MF.getFrameInfo();
4339 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4340 unsigned RBegin, REnd;
4341 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4342 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4343 } else {
4344 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4345 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4346 REnd = ARM::R4;
4347 }
4348
4349 if (REnd != RBegin)
4350 ArgOffset = -4 * (ARM::R4 - RBegin);
4351
4352 auto PtrVT = getPointerTy(DAG.getDataLayout());
4353 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4354 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4355
4356 SmallVector<SDValue, 4> MemOps;
4357 const TargetRegisterClass *RC =
4358 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4359
4360 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4361 Register VReg = MF.addLiveIn(Reg, RC);
4362 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4363 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4364 MachinePointerInfo(OrigArg, 4 * i));
4365 MemOps.push_back(Store);
4366 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4367 }
4368
4369 if (!MemOps.empty())
4370 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4371 return FrameIndex;
4372 }
4373
4374 // Set up the stack frame that the va_list pointer will start from.
4375 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4376 const SDLoc &dl, SDValue &Chain,
4377 unsigned ArgOffset,
4378 unsigned TotalArgRegsSaveSize,
4379 bool ForceMutable) const {
4380 MachineFunction &MF = DAG.getMachineFunction();
4381 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4382
4383 // Try to store any remaining integer argument regs
4384 // to their spots on the stack so that they may be loaded by dereferencing
4385 // the result of va_next.
4386 // If there are no regs to be stored, just point the address past the last
4387 // argument passed on the stack.
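// Illustration (assuming AAPCS-style GPR argument passing): for
// 'int f(int a, ...)', r0 carries 'a', so r1-r3 are still unallocated.
// StoreByValRegs spills them into a fixed object placed directly below the
// caller-provided stack arguments, so va_start can point at the saved r1
// slot and va_arg then walks registers and stack arguments contiguously.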
4388 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 4389 CCInfo.getInRegsParamsCount(), 4390 CCInfo.getNextStackOffset(), 4391 std::max(4U, TotalArgRegsSaveSize)); 4392 AFI->setVarArgsFrameIndex(FrameIndex); 4393 } 4394 4395 bool ARMTargetLowering::splitValueIntoRegisterParts( 4396 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, 4397 unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const { 4398 bool IsABIRegCopy = CC.has_value(); 4399 EVT ValueVT = Val.getValueType(); 4400 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && 4401 PartVT == MVT::f32) { 4402 unsigned ValueBits = ValueVT.getSizeInBits(); 4403 unsigned PartBits = PartVT.getSizeInBits(); 4404 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); 4405 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); 4406 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); 4407 Parts[0] = Val; 4408 return true; 4409 } 4410 return false; 4411 } 4412 4413 SDValue ARMTargetLowering::joinRegisterPartsIntoValue( 4414 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, 4415 MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const { 4416 bool IsABIRegCopy = CC.has_value(); 4417 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && 4418 PartVT == MVT::f32) { 4419 unsigned ValueBits = ValueVT.getSizeInBits(); 4420 unsigned PartBits = PartVT.getSizeInBits(); 4421 SDValue Val = Parts[0]; 4422 4423 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); 4424 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); 4425 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); 4426 return Val; 4427 } 4428 return SDValue(); 4429 } 4430 4431 SDValue ARMTargetLowering::LowerFormalArguments( 4432 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 4433 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 4434 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 4435 MachineFunction &MF = DAG.getMachineFunction(); 4436 MachineFrameInfo &MFI = MF.getFrameInfo(); 4437 4438 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4439 4440 // Assign locations to all of the incoming arguments. 4441 SmallVector<CCValAssign, 16> ArgLocs; 4442 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 4443 *DAG.getContext()); 4444 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 4445 4446 SmallVector<SDValue, 16> ArgValues; 4447 SDValue ArgValue; 4448 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 4449 unsigned CurArgIdx = 0; 4450 4451 // Initially ArgRegsSaveSize is zero. 4452 // Then we increase this value each time we meet byval parameter. 4453 // We also increase this value in case of varargs function. 4454 AFI->setArgRegsSaveSize(0); 4455 4456 // Calculate the amount of stack space that we need to allocate to store 4457 // byval and variadic arguments that are passed in registers. 4458 // We need to know this before we allocate the first byval or variadic 4459 // argument, as they will be allocated a stack slot below the CFA (Canonical 4460 // Frame Address, the stack pointer at entry to the function). 
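// Rough example: if the lowest register holding byval or variadic data is
// r2, the save area must cover r2 and r3, i.e. 4 * (ARM::R4 - ARM::R2) == 8
// bytes below the CFA. The arithmetic below assumes R0..R4 have consecutive
// register numbers, just as the existing RBegin/REnd handling does.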
4461 unsigned ArgRegBegin = ARM::R4; 4462 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4463 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 4464 break; 4465 4466 CCValAssign &VA = ArgLocs[i]; 4467 unsigned Index = VA.getValNo(); 4468 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 4469 if (!Flags.isByVal()) 4470 continue; 4471 4472 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 4473 unsigned RBegin, REnd; 4474 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 4475 ArgRegBegin = std::min(ArgRegBegin, RBegin); 4476 4477 CCInfo.nextInRegsParam(); 4478 } 4479 CCInfo.rewindByValRegsInfo(); 4480 4481 int lastInsIndex = -1; 4482 if (isVarArg && MFI.hasVAStart()) { 4483 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 4484 if (RegIdx != array_lengthof(GPRArgRegs)) 4485 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 4486 } 4487 4488 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 4489 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 4490 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4491 4492 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4493 CCValAssign &VA = ArgLocs[i]; 4494 if (Ins[VA.getValNo()].isOrigArg()) { 4495 std::advance(CurOrigArg, 4496 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 4497 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 4498 } 4499 // Arguments stored in registers. 4500 if (VA.isRegLoc()) { 4501 EVT RegVT = VA.getLocVT(); 4502 4503 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { 4504 // f64 and vector types are split up into multiple registers or 4505 // combinations of registers and stack slots. 4506 SDValue ArgValue1 = 4507 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4508 VA = ArgLocs[++i]; // skip ahead to next loc 4509 SDValue ArgValue2; 4510 if (VA.isMemLoc()) { 4511 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 4512 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4513 ArgValue2 = DAG.getLoad( 4514 MVT::f64, dl, Chain, FIN, 4515 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 4516 } else { 4517 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4518 } 4519 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 4520 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, 4521 ArgValue1, DAG.getIntPtrConstant(0, dl)); 4522 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, 4523 ArgValue2, DAG.getIntPtrConstant(1, dl)); 4524 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { 4525 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4526 } else { 4527 const TargetRegisterClass *RC; 4528 4529 if (RegVT == MVT::f16 || RegVT == MVT::bf16) 4530 RC = &ARM::HPRRegClass; 4531 else if (RegVT == MVT::f32) 4532 RC = &ARM::SPRRegClass; 4533 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 || 4534 RegVT == MVT::v4bf16) 4535 RC = &ARM::DPRRegClass; 4536 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 || 4537 RegVT == MVT::v8bf16) 4538 RC = &ARM::QPRRegClass; 4539 else if (RegVT == MVT::i32) 4540 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 4541 : &ARM::GPRRegClass; 4542 else 4543 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 4544 4545 // Transform the arguments in physical registers into virtual ones. 4546 Register Reg = MF.addLiveIn(VA.getLocReg(), RC); 4547 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 4548 4549 // If this value is passed in r0 and has the returned attribute (e.g. 
4550 // C++ 'structors), record this fact for later use. 4551 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { 4552 AFI->setPreservesR0(); 4553 } 4554 } 4555 4556 // If this is an 8 or 16-bit value, it is really passed promoted 4557 // to 32 bits. Insert an assert[sz]ext to capture this, then 4558 // truncate to the right size. 4559 switch (VA.getLocInfo()) { 4560 default: llvm_unreachable("Unknown loc info!"); 4561 case CCValAssign::Full: break; 4562 case CCValAssign::BCvt: 4563 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 4564 break; 4565 case CCValAssign::SExt: 4566 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 4567 DAG.getValueType(VA.getValVT())); 4568 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4569 break; 4570 case CCValAssign::ZExt: 4571 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 4572 DAG.getValueType(VA.getValVT())); 4573 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4574 break; 4575 } 4576 4577 // f16 arguments have their size extended to 4 bytes and passed as if they 4578 // had been copied to the LSBs of a 32-bit register. 4579 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 4580 if (VA.needsCustom() && 4581 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) 4582 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue); 4583 4584 InVals.push_back(ArgValue); 4585 } else { // VA.isRegLoc() 4586 // Only arguments passed on the stack should make it here. 4587 assert(VA.isMemLoc()); 4588 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 4589 4590 int index = VA.getValNo(); 4591 4592 // Some Ins[] entries become multiple ArgLoc[] entries. 4593 // Process them only once. 4594 if (index != lastInsIndex) 4595 { 4596 ISD::ArgFlagsTy Flags = Ins[index].Flags; 4597 // FIXME: For now, all byval parameter objects are marked mutable. 4598 // This can be changed with more analysis. 4599 // In case of tail call optimization mark all arguments mutable. 4600 // Since they could be overwritten by lowering of arguments in case of 4601 // a tail call. 4602 if (Flags.isByVal()) { 4603 assert(Ins[index].isOrigArg() && 4604 "Byval arguments cannot be implicit"); 4605 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 4606 4607 int FrameIndex = StoreByValRegs( 4608 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 4609 VA.getLocMemOffset(), Flags.getByValSize()); 4610 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 4611 CCInfo.nextInRegsParam(); 4612 } else { 4613 unsigned FIOffset = VA.getLocMemOffset(); 4614 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 4615 FIOffset, true); 4616 4617 // Create load nodes to retrieve arguments from the stack. 
4618 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4619 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 4620 MachinePointerInfo::getFixedStack( 4621 DAG.getMachineFunction(), FI))); 4622 } 4623 lastInsIndex = index; 4624 } 4625 } 4626 } 4627 4628 // varargs 4629 if (isVarArg && MFI.hasVAStart()) { 4630 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(), 4631 TotalArgRegsSaveSize); 4632 if (AFI->isCmseNSEntryFunction()) { 4633 DiagnosticInfoUnsupported Diag( 4634 DAG.getMachineFunction().getFunction(), 4635 "secure entry function must not be variadic", dl.getDebugLoc()); 4636 DAG.getContext()->diagnose(Diag); 4637 } 4638 } 4639 4640 unsigned StackArgSize = CCInfo.getNextStackOffset(); 4641 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 4642 if (canGuaranteeTCO(CallConv, TailCallOpt)) { 4643 // The only way to guarantee a tail call is if the callee restores its 4644 // argument area, but it must also keep the stack aligned when doing so. 4645 const DataLayout &DL = DAG.getDataLayout(); 4646 StackArgSize = alignTo(StackArgSize, DL.getStackAlignment()); 4647 4648 AFI->setArgumentStackToRestore(StackArgSize); 4649 } 4650 AFI->setArgumentStackSize(StackArgSize); 4651 4652 if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) { 4653 DiagnosticInfoUnsupported Diag( 4654 DAG.getMachineFunction().getFunction(), 4655 "secure entry function requires arguments on stack", dl.getDebugLoc()); 4656 DAG.getContext()->diagnose(Diag); 4657 } 4658 4659 return Chain; 4660 } 4661 4662 /// isFloatingPointZero - Return true if this is +0.0. 4663 static bool isFloatingPointZero(SDValue Op) { 4664 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 4665 return CFP->getValueAPF().isPosZero(); 4666 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 4667 // Maybe this has already been legalized into the constant pool? 4668 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 4669 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 4670 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 4671 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 4672 return CFP->getValueAPF().isPosZero(); 4673 } 4674 } else if (Op->getOpcode() == ISD::BITCAST && 4675 Op->getValueType(0) == MVT::f64) { 4676 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 4677 // created by LowerConstantFP(). 4678 SDValue BitcastOp = Op->getOperand(0); 4679 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 4680 isNullConstant(BitcastOp->getOperand(0))) 4681 return true; 4682 } 4683 return false; 4684 } 4685 4686 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 4687 /// the given operands. 4688 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 4689 SDValue &ARMcc, SelectionDAG &DAG, 4690 const SDLoc &dl) const { 4691 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 4692 unsigned C = RHSC->getZExtValue(); 4693 if (!isLegalICmpImmediate((int32_t)C)) { 4694 // Constant does not fit, try adjusting it by one. 4695 switch (CC) { 4696 default: break; 4697 case ISD::SETLT: 4698 case ISD::SETGE: 4699 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 4700 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 4701 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4702 } 4703 break; 4704 case ISD::SETULT: 4705 case ISD::SETUGE: 4706 if (C != 0 && isLegalICmpImmediate(C-1)) { 4707 CC = (CC == ISD::SETULT) ? 
ISD::SETULE : ISD::SETUGT; 4708 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4709 } 4710 break; 4711 case ISD::SETLE: 4712 case ISD::SETGT: 4713 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 4714 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 4715 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4716 } 4717 break; 4718 case ISD::SETULE: 4719 case ISD::SETUGT: 4720 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 4721 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 4722 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4723 } 4724 break; 4725 } 4726 } 4727 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && 4728 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { 4729 // In ARM and Thumb-2, the compare instructions can shift their second 4730 // operand. 4731 CC = ISD::getSetCCSwappedOperands(CC); 4732 std::swap(LHS, RHS); 4733 } 4734 4735 // Thumb1 has very limited immediate modes, so turning an "and" into a 4736 // shift can save multiple instructions. 4737 // 4738 // If we have (x & C1), and C1 is an appropriate mask, we can transform it 4739 // into "((x << n) >> n)". But that isn't necessarily profitable on its 4740 // own. If it's the operand to an unsigned comparison with an immediate, 4741 // we can eliminate one of the shifts: we transform 4742 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". 4743 // 4744 // We avoid transforming cases which aren't profitable due to encoding 4745 // details: 4746 // 4747 // 1. C2 fits into the immediate field of a cmp, and the transformed version 4748 // would not; in that case, we're essentially trading one immediate load for 4749 // another. 4750 // 2. C1 is 255 or 65535, so we can use uxtb or uxth. 4751 // 3. C2 is zero; we have other code for this special case. 4752 // 4753 // FIXME: Figure out profitability for Thumb2; we usually can't save an 4754 // instruction, since the AND is always one instruction anyway, but we could 4755 // use narrow instructions in some cases. 4756 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && 4757 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) && 4758 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) && 4759 !isSignedIntSetCC(CC)) { 4760 unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue(); 4761 auto *RHSC = cast<ConstantSDNode>(RHS.getNode()); 4762 uint64_t RHSV = RHSC->getZExtValue(); 4763 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { 4764 unsigned ShiftBits = countLeadingZeros(Mask); 4765 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { 4766 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); 4767 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); 4768 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); 4769 } 4770 } 4771 } 4772 4773 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a 4774 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same 4775 // way a cmp would. 4776 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and 4777 // some tweaks to the heuristics for the previous and->shift transform. 4778 // FIXME: Optimize cases where the LHS isn't a shift. 
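// Worked example (c == 3): '(x << 3) > 0x80000000u' holds iff bit 28 of x is
// set and bits 27:0 are not all zero. 'lsls r0, r0, #4' puts bit 28 of x into
// C and sets Z iff bits 27:0 are all zero, so the HI condition (C set, Z
// clear) after the shift answers the same question as the cmp would.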
4779 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && 4780 isa<ConstantSDNode>(RHS) && 4781 cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U && 4782 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) && 4783 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) { 4784 unsigned ShiftAmt = 4785 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1; 4786 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl, 4787 DAG.getVTList(MVT::i32, MVT::i32), 4788 LHS.getOperand(0), 4789 DAG.getConstant(ShiftAmt, dl, MVT::i32)); 4790 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 4791 Shift.getValue(1), SDValue()); 4792 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32); 4793 return Chain.getValue(1); 4794 } 4795 4796 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4797 4798 // If the RHS is a constant zero then the V (overflow) flag will never be 4799 // set. This can allow us to simplify GE to PL or LT to MI, which can be 4800 // simpler for other passes (like the peephole optimiser) to deal with. 4801 if (isNullConstant(RHS)) { 4802 switch (CondCode) { 4803 default: break; 4804 case ARMCC::GE: 4805 CondCode = ARMCC::PL; 4806 break; 4807 case ARMCC::LT: 4808 CondCode = ARMCC::MI; 4809 break; 4810 } 4811 } 4812 4813 ARMISD::NodeType CompareType; 4814 switch (CondCode) { 4815 default: 4816 CompareType = ARMISD::CMP; 4817 break; 4818 case ARMCC::EQ: 4819 case ARMCC::NE: 4820 // Uses only Z Flag 4821 CompareType = ARMISD::CMPZ; 4822 break; 4823 } 4824 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4825 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 4826 } 4827 4828 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 4829 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 4830 SelectionDAG &DAG, const SDLoc &dl, 4831 bool Signaling) const { 4832 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); 4833 SDValue Cmp; 4834 if (!isFloatingPointZero(RHS)) 4835 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, 4836 dl, MVT::Glue, LHS, RHS); 4837 else 4838 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, 4839 dl, MVT::Glue, LHS); 4840 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 4841 } 4842 4843 /// duplicateCmp - Glue values can have only one use, so this function 4844 /// duplicates a comparison node. 4845 SDValue 4846 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 4847 unsigned Opc = Cmp.getOpcode(); 4848 SDLoc DL(Cmp); 4849 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 4850 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4851 4852 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 4853 Cmp = Cmp.getOperand(0); 4854 Opc = Cmp.getOpcode(); 4855 if (Opc == ARMISD::CMPFP) 4856 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4857 else { 4858 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 4859 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); 4860 } 4861 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 4862 } 4863 4864 // This function returns three things: the arithmetic computation itself 4865 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The 4866 // comparison and the condition code define the case in which the arithmetic 4867 // computation *does not* overflow. 
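// For example, for SADDO the lowering below emits Value = ADD(LHS, RHS) and
// OverflowCmp = CMP(Value, LHS): the subtraction Value - LHS overflows (sets
// V) exactly when the original addition wrapped, so ARMcc = VC names the
// no-overflow case.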
4868 std::pair<SDValue, SDValue> 4869 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 4870 SDValue &ARMcc) const { 4871 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 4872 4873 SDValue Value, OverflowCmp; 4874 SDValue LHS = Op.getOperand(0); 4875 SDValue RHS = Op.getOperand(1); 4876 SDLoc dl(Op); 4877 4878 // FIXME: We are currently always generating CMPs because we don't support 4879 // generating CMN through the backend. This is not as good as the natural 4880 // CMP case because it causes a register dependency and cannot be folded 4881 // later. 4882 4883 switch (Op.getOpcode()) { 4884 default: 4885 llvm_unreachable("Unknown overflow instruction!"); 4886 case ISD::SADDO: 4887 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4888 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 4889 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4890 break; 4891 case ISD::UADDO: 4892 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4893 // We use ADDC here to correspond to its use in LowerUnsignedALUO. 4894 // We do not use it in the USUBO case as Value may not be used. 4895 Value = DAG.getNode(ARMISD::ADDC, dl, 4896 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) 4897 .getValue(0); 4898 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4899 break; 4900 case ISD::SSUBO: 4901 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4902 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4903 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4904 break; 4905 case ISD::USUBO: 4906 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4907 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4908 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4909 break; 4910 case ISD::UMULO: 4911 // We generate a UMUL_LOHI and then check if the high word is 0. 4912 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4913 Value = DAG.getNode(ISD::UMUL_LOHI, dl, 4914 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4915 LHS, RHS); 4916 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4917 DAG.getConstant(0, dl, MVT::i32)); 4918 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4919 break; 4920 case ISD::SMULO: 4921 // We generate a SMUL_LOHI and then check if all the bits of the high word 4922 // are the same as the sign bit of the low word. 4923 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4924 Value = DAG.getNode(ISD::SMUL_LOHI, dl, 4925 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4926 LHS, RHS); 4927 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4928 DAG.getNode(ISD::SRA, dl, Op.getValueType(), 4929 Value.getValue(0), 4930 DAG.getConstant(31, dl, MVT::i32))); 4931 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4932 break; 4933 } // switch (...) 4934 4935 return std::make_pair(Value, OverflowCmp); 4936 } 4937 4938 SDValue 4939 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { 4940 // Let legalize expand this if it isn't a legal type yet. 4941 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4942 return SDValue(); 4943 4944 SDValue Value, OverflowCmp; 4945 SDValue ARMcc; 4946 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 4947 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4948 SDLoc dl(Op); 4949 // We use 0 and 1 as false and true values. 
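// Note that ARMcc names the condition under which the arithmetic does *not*
// overflow (see getARMXALUOOp), so the CMOV built below yields 0 when that
// condition holds and 1 on overflow.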
4950 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 4951 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 4952 EVT VT = Op.getValueType(); 4953 4954 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 4955 ARMcc, CCR, OverflowCmp); 4956 4957 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 4958 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4959 } 4960 4961 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, 4962 SelectionDAG &DAG) { 4963 SDLoc DL(BoolCarry); 4964 EVT CarryVT = BoolCarry.getValueType(); 4965 4966 // This converts the boolean value carry into the carry flag by doing 4967 // ARMISD::SUBC Carry, 1 4968 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL, 4969 DAG.getVTList(CarryVT, MVT::i32), 4970 BoolCarry, DAG.getConstant(1, DL, CarryVT)); 4971 return Carry.getValue(1); 4972 } 4973 4974 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, 4975 SelectionDAG &DAG) { 4976 SDLoc DL(Flags); 4977 4978 // Now convert the carry flag into a boolean carry. We do this 4979 // using ARMISD:ADDE 0, 0, Carry 4980 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32), 4981 DAG.getConstant(0, DL, MVT::i32), 4982 DAG.getConstant(0, DL, MVT::i32), Flags); 4983 } 4984 4985 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, 4986 SelectionDAG &DAG) const { 4987 // Let legalize expand this if it isn't a legal type yet. 4988 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4989 return SDValue(); 4990 4991 SDValue LHS = Op.getOperand(0); 4992 SDValue RHS = Op.getOperand(1); 4993 SDLoc dl(Op); 4994 4995 EVT VT = Op.getValueType(); 4996 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 4997 SDValue Value; 4998 SDValue Overflow; 4999 switch (Op.getOpcode()) { 5000 default: 5001 llvm_unreachable("Unknown overflow instruction!"); 5002 case ISD::UADDO: 5003 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS); 5004 // Convert the carry flag into a boolean value. 5005 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 5006 break; 5007 case ISD::USUBO: { 5008 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS); 5009 // Convert the carry flag into a boolean value. 5010 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 5011 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow 5012 // value. So compute 1 - C. 
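// For example, an unsigned 3 - 5 borrows: the carry flag is clear, the
// boolean carry computed above is 0, and the reported overflow becomes
// 1 - 0 = 1.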
5013 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, 5014 DAG.getConstant(1, dl, MVT::i32), Overflow); 5015 break; 5016 } 5017 } 5018 5019 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 5020 } 5021 5022 static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, 5023 const ARMSubtarget *Subtarget) { 5024 EVT VT = Op.getValueType(); 5025 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 5026 return SDValue(); 5027 if (!VT.isSimple()) 5028 return SDValue(); 5029 5030 unsigned NewOpcode; 5031 switch (VT.getSimpleVT().SimpleTy) { 5032 default: 5033 return SDValue(); 5034 case MVT::i8: 5035 switch (Op->getOpcode()) { 5036 case ISD::UADDSAT: 5037 NewOpcode = ARMISD::UQADD8b; 5038 break; 5039 case ISD::SADDSAT: 5040 NewOpcode = ARMISD::QADD8b; 5041 break; 5042 case ISD::USUBSAT: 5043 NewOpcode = ARMISD::UQSUB8b; 5044 break; 5045 case ISD::SSUBSAT: 5046 NewOpcode = ARMISD::QSUB8b; 5047 break; 5048 } 5049 break; 5050 case MVT::i16: 5051 switch (Op->getOpcode()) { 5052 case ISD::UADDSAT: 5053 NewOpcode = ARMISD::UQADD16b; 5054 break; 5055 case ISD::SADDSAT: 5056 NewOpcode = ARMISD::QADD16b; 5057 break; 5058 case ISD::USUBSAT: 5059 NewOpcode = ARMISD::UQSUB16b; 5060 break; 5061 case ISD::SSUBSAT: 5062 NewOpcode = ARMISD::QSUB16b; 5063 break; 5064 } 5065 break; 5066 } 5067 5068 SDLoc dl(Op); 5069 SDValue Add = 5070 DAG.getNode(NewOpcode, dl, MVT::i32, 5071 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32), 5072 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32)); 5073 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add); 5074 } 5075 5076 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 5077 SDValue Cond = Op.getOperand(0); 5078 SDValue SelectTrue = Op.getOperand(1); 5079 SDValue SelectFalse = Op.getOperand(2); 5080 SDLoc dl(Op); 5081 unsigned Opc = Cond.getOpcode(); 5082 5083 if (Cond.getResNo() == 1 && 5084 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5085 Opc == ISD::USUBO)) { 5086 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 5087 return SDValue(); 5088 5089 SDValue Value, OverflowCmp; 5090 SDValue ARMcc; 5091 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 5092 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5093 EVT VT = Op.getValueType(); 5094 5095 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 5096 OverflowCmp, DAG); 5097 } 5098 5099 // Convert: 5100 // 5101 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 5102 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 5103 // 5104 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 5105 const ConstantSDNode *CMOVTrue = 5106 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 5107 const ConstantSDNode *CMOVFalse = 5108 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 5109 5110 if (CMOVTrue && CMOVFalse) { 5111 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 5112 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 5113 5114 SDValue True; 5115 SDValue False; 5116 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 5117 True = SelectTrue; 5118 False = SelectFalse; 5119 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 5120 True = SelectFalse; 5121 False = SelectTrue; 5122 } 5123 5124 if (True.getNode() && False.getNode()) { 5125 EVT VT = Op.getValueType(); 5126 SDValue ARMcc = Cond.getOperand(2); 5127 SDValue CCR = Cond.getOperand(3); 5128 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 5129 assert(True.getValueType() == VT); 5130 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 5131 } 5132 } 5133 } 5134 5135 // 
ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 5136 // undefined bits before doing a full-word comparison with zero. 5137 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 5138 DAG.getConstant(1, dl, Cond.getValueType())); 5139 5140 return DAG.getSelectCC(dl, Cond, 5141 DAG.getConstant(0, dl, Cond.getValueType()), 5142 SelectTrue, SelectFalse, ISD::SETNE); 5143 } 5144 5145 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 5146 bool &swpCmpOps, bool &swpVselOps) { 5147 // Start by selecting the GE condition code for opcodes that return true for 5148 // 'equality' 5149 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 5150 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) 5151 CondCode = ARMCC::GE; 5152 5153 // and GT for opcodes that return false for 'equality'. 5154 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 5155 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) 5156 CondCode = ARMCC::GT; 5157 5158 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 5159 // to swap the compare operands. 5160 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 5161 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) 5162 swpCmpOps = true; 5163 5164 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 5165 // If we have an unordered opcode, we need to swap the operands to the VSEL 5166 // instruction (effectively negating the condition). 5167 // 5168 // This also has the effect of swapping which one of 'less' or 'greater' 5169 // returns true, so we also swap the compare operands. It also switches 5170 // whether we return true for 'equality', so we compensate by picking the 5171 // opposite condition code to our original choice. 5172 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 5173 CC == ISD::SETUGT) { 5174 swpCmpOps = !swpCmpOps; 5175 swpVselOps = !swpVselOps; 5176 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 5177 } 5178 5179 // 'ordered' is 'anything but unordered', so use the VS condition code and 5180 // swap the VSEL operands. 5181 if (CC == ISD::SETO) { 5182 CondCode = ARMCC::VS; 5183 swpVselOps = true; 5184 } 5185 5186 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 5187 // code and swap the VSEL operands. Also do this if we don't care about the 5188 // unordered case. 
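// For example, 'a != b ? x : y' becomes a VSEL using EQ with swapped
// operands: it picks y when the compare signals equality and x otherwise,
// which matches the original SETNE selection.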
5189 if (CC == ISD::SETUNE || CC == ISD::SETNE) { 5190 CondCode = ARMCC::EQ; 5191 swpVselOps = true; 5192 } 5193 } 5194 5195 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 5196 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 5197 SDValue Cmp, SelectionDAG &DAG) const { 5198 if (!Subtarget->hasFP64() && VT == MVT::f64) { 5199 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 5200 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 5201 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 5202 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 5203 5204 SDValue TrueLow = TrueVal.getValue(0); 5205 SDValue TrueHigh = TrueVal.getValue(1); 5206 SDValue FalseLow = FalseVal.getValue(0); 5207 SDValue FalseHigh = FalseVal.getValue(1); 5208 5209 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 5210 ARMcc, CCR, Cmp); 5211 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 5212 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 5213 5214 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 5215 } else { 5216 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 5217 Cmp); 5218 } 5219 } 5220 5221 static bool isGTorGE(ISD::CondCode CC) { 5222 return CC == ISD::SETGT || CC == ISD::SETGE; 5223 } 5224 5225 static bool isLTorLE(ISD::CondCode CC) { 5226 return CC == ISD::SETLT || CC == ISD::SETLE; 5227 } 5228 5229 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 5230 // All of these conditions (and their <= and >= counterparts) will do: 5231 // x < k ? k : x 5232 // x > k ? x : k 5233 // k < x ? x : k 5234 // k > x ? k : x 5235 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 5236 const SDValue TrueVal, const SDValue FalseVal, 5237 const ISD::CondCode CC, const SDValue K) { 5238 return (isGTorGE(CC) && 5239 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 5240 (isLTorLE(CC) && 5241 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 5242 } 5243 5244 // Check if two chained conditionals could be converted into SSAT or USAT. 5245 // 5246 // SSAT can replace a set of two conditional selectors that bound a number to an 5247 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: 5248 // 5249 // x < -k ? -k : (x > k ? k : x) 5250 // x < -k ? -k : (x < k ? x : k) 5251 // x > -k ? (x > k ? k : x) : -k 5252 // x < k ? (x < -k ? -k : x) : k 5253 // etc. 5254 // 5255 // LLVM canonicalizes these to either a min(max()) or a max(min()) 5256 // pattern. This function tries to match one of these and will return a SSAT 5257 // node if successful. 5258 // 5259 // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 5260 // is a power of 2. 5261 static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) { 5262 EVT VT = Op.getValueType(); 5263 SDValue V1 = Op.getOperand(0); 5264 SDValue K1 = Op.getOperand(1); 5265 SDValue TrueVal1 = Op.getOperand(2); 5266 SDValue FalseVal1 = Op.getOperand(3); 5267 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 5268 5269 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? 
FalseVal1 : TrueVal1; 5270 if (Op2.getOpcode() != ISD::SELECT_CC) 5271 return SDValue(); 5272 5273 SDValue V2 = Op2.getOperand(0); 5274 SDValue K2 = Op2.getOperand(1); 5275 SDValue TrueVal2 = Op2.getOperand(2); 5276 SDValue FalseVal2 = Op2.getOperand(3); 5277 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); 5278 5279 SDValue V1Tmp = V1; 5280 SDValue V2Tmp = V2; 5281 5282 // Check that the registers and the constants match a max(min()) or min(max()) 5283 // pattern 5284 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 || 5285 K2 != FalseVal2 || 5286 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2)))) 5287 return SDValue(); 5288 5289 // Check that the constant in the lower-bound check is 5290 // the opposite of the constant in the upper-bound check 5291 // in 1's complement. 5292 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2)) 5293 return SDValue(); 5294 5295 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue(); 5296 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue(); 5297 int64_t PosVal = std::max(Val1, Val2); 5298 int64_t NegVal = std::min(Val1, Val2); 5299 5300 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) || 5301 !isPowerOf2_64(PosVal + 1)) 5302 return SDValue(); 5303 5304 // Handle the difference between USAT (unsigned) and SSAT (signed) 5305 // saturation 5306 // At this point, PosVal is guaranteed to be positive 5307 uint64_t K = PosVal; 5308 SDLoc dl(Op); 5309 if (Val1 == ~Val2) 5310 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp, 5311 DAG.getConstant(countTrailingOnes(K), dl, VT)); 5312 if (NegVal == 0) 5313 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp, 5314 DAG.getConstant(countTrailingOnes(K), dl, VT)); 5315 5316 return SDValue(); 5317 } 5318 5319 // Check if a condition of the type x < k ? k : x can be converted into a 5320 // bit operation instead of conditional moves. 5321 // Currently this is allowed given: 5322 // - The conditions and values match up 5323 // - k is 0 or -1 (all ones) 5324 // This function will not check the last condition, thats up to the caller 5325 // It returns true if the transformation can be made, and in such case 5326 // returns x in V, and k in SatK. 5327 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, 5328 SDValue &SatK) 5329 { 5330 SDValue LHS = Op.getOperand(0); 5331 SDValue RHS = Op.getOperand(1); 5332 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 5333 SDValue TrueVal = Op.getOperand(2); 5334 SDValue FalseVal = Op.getOperand(3); 5335 5336 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS) 5337 ? &RHS 5338 : nullptr; 5339 5340 // No constant operation in comparison, early out 5341 if (!K) 5342 return false; 5343 5344 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal; 5345 V = (KTmp == TrueVal) ? FalseVal : TrueVal; 5346 SDValue VTmp = (K && *K == LHS) ? 
RHS : LHS; 5347 5348 // If the constant on left and right side, or variable on left and right, 5349 // does not match, early out 5350 if (*K != KTmp || V != VTmp) 5351 return false; 5352 5353 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) { 5354 SatK = *K; 5355 return true; 5356 } 5357 5358 return false; 5359 } 5360 5361 bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const { 5362 if (VT == MVT::f32) 5363 return !Subtarget->hasVFP2Base(); 5364 if (VT == MVT::f64) 5365 return !Subtarget->hasFP64(); 5366 if (VT == MVT::f16) 5367 return !Subtarget->hasFullFP16(); 5368 return false; 5369 } 5370 5371 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 5372 EVT VT = Op.getValueType(); 5373 SDLoc dl(Op); 5374 5375 // Try to convert two saturating conditional selects into a single SSAT 5376 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) 5377 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG)) 5378 return SatValue; 5379 5380 // Try to convert expressions of the form x < k ? k : x (and similar forms) 5381 // into more efficient bit operations, which is possible when k is 0 or -1 5382 // On ARM and Thumb-2 which have flexible operand 2 this will result in 5383 // single instructions. On Thumb the shift and the bit operation will be two 5384 // instructions. 5385 // Only allow this transformation on full-width (32-bit) operations 5386 SDValue LowerSatConstant; 5387 SDValue SatValue; 5388 if (VT == MVT::i32 && 5389 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { 5390 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, 5391 DAG.getConstant(31, dl, VT)); 5392 if (isNullConstant(LowerSatConstant)) { 5393 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV, 5394 DAG.getAllOnesConstant(dl, VT)); 5395 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV); 5396 } else if (isAllOnesConstant(LowerSatConstant)) 5397 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV); 5398 } 5399 5400 SDValue LHS = Op.getOperand(0); 5401 SDValue RHS = Op.getOperand(1); 5402 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 5403 SDValue TrueVal = Op.getOperand(2); 5404 SDValue FalseVal = Op.getOperand(3); 5405 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal); 5406 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal); 5407 5408 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal && 5409 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) { 5410 unsigned TVal = CTVal->getZExtValue(); 5411 unsigned FVal = CFVal->getZExtValue(); 5412 unsigned Opcode = 0; 5413 5414 if (TVal == ~FVal) { 5415 Opcode = ARMISD::CSINV; 5416 } else if (TVal == ~FVal + 1) { 5417 Opcode = ARMISD::CSNEG; 5418 } else if (TVal + 1 == FVal) { 5419 Opcode = ARMISD::CSINC; 5420 } else if (TVal == FVal + 1) { 5421 Opcode = ARMISD::CSINC; 5422 std::swap(TrueVal, FalseVal); 5423 std::swap(TVal, FVal); 5424 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5425 } 5426 5427 if (Opcode) { 5428 // If one of the constants is cheaper than another, materialise the 5429 // cheaper one and let the csel generate the other. 5430 if (Opcode != ARMISD::CSINC && 5431 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) { 5432 std::swap(TrueVal, FalseVal); 5433 std::swap(TVal, FVal); 5434 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5435 } 5436 5437 // Attempt to use ZR checking TVal is 0, possibly inverting the condition 5438 // to get there. 
CSINC is not invertible like the other two (~(~a) == a,
5439 // -(-a) == a, but (a+1)+1 != a).
5440 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5441 std::swap(TrueVal, FalseVal);
5442 std::swap(TVal, FVal);
5443 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5444 }
5445
5446 // Drops F's value because we can get it by inverting/negating TVal.
5447 FalseVal = TrueVal;
5448
5449 SDValue ARMcc;
5450 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5451 EVT VT = TrueVal.getValueType();
5452 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5453 }
5454 }
5455
5456 if (isUnsupportedFloatingType(LHS.getValueType())) {
5457 DAG.getTargetLoweringInfo().softenSetCCOperands(
5458 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5459
5460 // If softenSetCCOperands only returned one value, we should compare it to
5461 // zero.
5462 if (!RHS.getNode()) {
5463 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5464 CC = ISD::SETNE;
5465 }
5466 }
5467
5468 if (LHS.getValueType() == MVT::i32) {
5469 // Try to generate VSEL on ARMv8.
5470 // The VSEL instruction can't use all the usual ARM condition
5471 // codes: it only has two bits to select the condition code, so it's
5472 // constrained to use only GE, GT, VS and EQ.
5473 //
5474 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5475 // swap the operands of the previous compare instruction (effectively
5476 // inverting the compare condition, swapping 'less' and 'greater') and
5477 // sometimes need to swap the operands to the VSEL (which inverts the
5478 // condition in the sense of firing whenever the previous condition didn't).
5479 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5480 TrueVal.getValueType() == MVT::f32 ||
5481 TrueVal.getValueType() == MVT::f64)) {
5482 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5483 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5484 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5485 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5486 std::swap(TrueVal, FalseVal);
5487 }
5488 }
5489
5490 SDValue ARMcc;
5491 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5492 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5493 // Choose GE over PL, which vsel does not support.
5494 if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
5495 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5496 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5497 }
5498
5499 ARMCC::CondCodes CondCode, CondCode2;
5500 FPCCToARMCC(CC, CondCode, CondCode2);
5501
5502 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5503 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5504 // must use VSEL (limited condition codes), due to not having conditional f16
5505 // moves.
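// (Keeping +0.0 on the RHS lets getVFPCmp emit the compare-with-zero form,
// which avoids materializing the floating-point constant in a register.)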
5506 if (Subtarget->hasFPARMv8Base() && 5507 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) && 5508 (TrueVal.getValueType() == MVT::f16 || 5509 TrueVal.getValueType() == MVT::f32 || 5510 TrueVal.getValueType() == MVT::f64)) { 5511 bool swpCmpOps = false; 5512 bool swpVselOps = false; 5513 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 5514 5515 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 5516 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 5517 if (swpCmpOps) 5518 std::swap(LHS, RHS); 5519 if (swpVselOps) 5520 std::swap(TrueVal, FalseVal); 5521 } 5522 } 5523 5524 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5525 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5526 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5527 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 5528 if (CondCode2 != ARMCC::AL) { 5529 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 5530 // FIXME: Needs another CMP because flag can have but one use. 5531 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); 5532 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 5533 } 5534 return Result; 5535 } 5536 5537 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 5538 /// to morph to an integer compare sequence. 5539 static bool canChangeToInt(SDValue Op, bool &SeenZero, 5540 const ARMSubtarget *Subtarget) { 5541 SDNode *N = Op.getNode(); 5542 if (!N->hasOneUse()) 5543 // Otherwise it requires moving the value from fp to integer registers. 5544 return false; 5545 if (!N->getNumValues()) 5546 return false; 5547 EVT VT = Op.getValueType(); 5548 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 5549 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 5550 // vmrs are very slow, e.g. cortex-a8. 5551 return false; 5552 5553 if (isFloatingPointZero(Op)) { 5554 SeenZero = true; 5555 return true; 5556 } 5557 return ISD::isNormalLoad(N); 5558 } 5559 5560 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 5561 if (isFloatingPointZero(Op)) 5562 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 5563 5564 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 5565 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), 5566 Ld->getPointerInfo(), Ld->getAlign(), 5567 Ld->getMemOperand()->getFlags()); 5568 5569 llvm_unreachable("Unknown VFP cmp argument!"); 5570 } 5571 5572 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 5573 SDValue &RetVal1, SDValue &RetVal2) { 5574 SDLoc dl(Op); 5575 5576 if (isFloatingPointZero(Op)) { 5577 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 5578 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 5579 return; 5580 } 5581 5582 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 5583 SDValue Ptr = Ld->getBasePtr(); 5584 RetVal1 = 5585 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 5586 Ld->getAlign(), Ld->getMemOperand()->getFlags()); 5587 5588 EVT PtrType = Ptr.getValueType(); 5589 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 5590 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 5591 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, 5592 Ld->getPointerInfo().getWithOffset(4), 5593 commonAlignment(Ld->getAlign(), 4), 5594 Ld->getMemOperand()->getFlags()); 5595 return; 5596 } 5597 5598 llvm_unreachable("Unknown VFP cmp argument!"); 5599 } 5600 5601 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 5602 /// f32 and even f64 comparisons to integer ones. 
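/// Roughly: when one operand is known to be +0.0 and the condition is an
/// (un)ordered equality, both operands are re-read as i32 with the sign bit
/// masked off, so -0.0 still compares equal to +0.0; f64 compares mask the
/// high word and are emitted as a BCC_i64 node.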
5603 SDValue 5604 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 5605 SDValue Chain = Op.getOperand(0); 5606 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5607 SDValue LHS = Op.getOperand(2); 5608 SDValue RHS = Op.getOperand(3); 5609 SDValue Dest = Op.getOperand(4); 5610 SDLoc dl(Op); 5611 5612 bool LHSSeenZero = false; 5613 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 5614 bool RHSSeenZero = false; 5615 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 5616 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 5617 // If unsafe fp math optimization is enabled and there are no other uses of 5618 // the CMP operands, and the condition code is EQ or NE, we can optimize it 5619 // to an integer comparison. 5620 if (CC == ISD::SETOEQ) 5621 CC = ISD::SETEQ; 5622 else if (CC == ISD::SETUNE) 5623 CC = ISD::SETNE; 5624 5625 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5626 SDValue ARMcc; 5627 if (LHS.getValueType() == MVT::f32) { 5628 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5629 bitcastf32Toi32(LHS, DAG), Mask); 5630 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5631 bitcastf32Toi32(RHS, DAG), Mask); 5632 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5633 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5634 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5635 Chain, Dest, ARMcc, CCR, Cmp); 5636 } 5637 5638 SDValue LHS1, LHS2; 5639 SDValue RHS1, RHS2; 5640 expandf64Toi32(LHS, DAG, LHS1, LHS2); 5641 expandf64Toi32(RHS, DAG, RHS1, RHS2); 5642 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 5643 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 5644 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 5645 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5646 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5647 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 5648 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 5649 } 5650 5651 return SDValue(); 5652 } 5653 5654 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 5655 SDValue Chain = Op.getOperand(0); 5656 SDValue Cond = Op.getOperand(1); 5657 SDValue Dest = Op.getOperand(2); 5658 SDLoc dl(Op); 5659 5660 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5661 // instruction. 5662 unsigned Opc = Cond.getOpcode(); 5663 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5664 !Subtarget->isThumb1Only(); 5665 if (Cond.getResNo() == 1 && 5666 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5667 Opc == ISD::USUBO || OptimizeMul)) { 5668 // Only lower legal XALUO ops. 5669 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 5670 return SDValue(); 5671 5672 // The actual operation with overflow check. 5673 SDValue Value, OverflowCmp; 5674 SDValue ARMcc; 5675 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 5676 5677 // Reverse the condition code. 
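// getARMXALUOOp returns the condition for the *no overflow* case, but this
// branch must be taken when the overflow bit is set, so use the opposite
// condition.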
5678 ARMCC::CondCodes CondCode = 5679 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5680 CondCode = ARMCC::getOppositeCondition(CondCode); 5681 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5682 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5683 5684 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5685 OverflowCmp); 5686 } 5687 5688 return SDValue(); 5689 } 5690 5691 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 5692 SDValue Chain = Op.getOperand(0); 5693 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5694 SDValue LHS = Op.getOperand(2); 5695 SDValue RHS = Op.getOperand(3); 5696 SDValue Dest = Op.getOperand(4); 5697 SDLoc dl(Op); 5698 5699 if (isUnsupportedFloatingType(LHS.getValueType())) { 5700 DAG.getTargetLoweringInfo().softenSetCCOperands( 5701 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); 5702 5703 // If softenSetCCOperands only returned one value, we should compare it to 5704 // zero. 5705 if (!RHS.getNode()) { 5706 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 5707 CC = ISD::SETNE; 5708 } 5709 } 5710 5711 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5712 // instruction. 5713 unsigned Opc = LHS.getOpcode(); 5714 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5715 !Subtarget->isThumb1Only(); 5716 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && 5717 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5718 Opc == ISD::USUBO || OptimizeMul) && 5719 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5720 // Only lower legal XALUO ops. 5721 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 5722 return SDValue(); 5723 5724 // The actual operation with overflow check. 5725 SDValue Value, OverflowCmp; 5726 SDValue ARMcc; 5727 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); 5728 5729 if ((CC == ISD::SETNE) != isOneConstant(RHS)) { 5730 // Reverse the condition code. 
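// We only get here when the branch fires on the overflow bit being set
// ((bit == 1) or (bit != 0)); ARMcc currently names the no-overflow case, so
// it has to be inverted.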
5731 ARMCC::CondCodes CondCode = 5732 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5733 CondCode = ARMCC::getOppositeCondition(CondCode); 5734 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5735 } 5736 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5737 5738 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5739 OverflowCmp); 5740 } 5741 5742 if (LHS.getValueType() == MVT::i32) { 5743 SDValue ARMcc; 5744 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5745 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5746 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5747 Chain, Dest, ARMcc, CCR, Cmp); 5748 } 5749 5750 if (getTargetMachine().Options.UnsafeFPMath && 5751 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 5752 CC == ISD::SETNE || CC == ISD::SETUNE)) { 5753 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 5754 return Result; 5755 } 5756 5757 ARMCC::CondCodes CondCode, CondCode2; 5758 FPCCToARMCC(CC, CondCode, CondCode2); 5759 5760 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5761 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5762 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5763 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5764 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 5765 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5766 if (CondCode2 != ARMCC::AL) { 5767 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 5768 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 5769 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5770 } 5771 return Res; 5772 } 5773 5774 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 5775 SDValue Chain = Op.getOperand(0); 5776 SDValue Table = Op.getOperand(1); 5777 SDValue Index = Op.getOperand(2); 5778 SDLoc dl(Op); 5779 5780 EVT PTy = getPointerTy(DAG.getDataLayout()); 5781 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 5782 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 5783 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 5784 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 5785 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); 5786 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { 5787 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table 5788 // which does another jump to the destination. This also makes it easier 5789 // to translate it to TBB / TBH later (Thumb2 only). 5790 // FIXME: This might not work if the function is extremely large. 
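    // BR2_JT keeps the computed entry address, the original table index and
    // the jump-table id as operands, so later passes can still rewrite the
    // two-level jump into TBB/TBH form when possible.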
5791 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 5792 Addr, Op.getOperand(2), JTI); 5793 } 5794 if (isPositionIndependent() || Subtarget->isROPI()) { 5795 Addr = 5796 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 5797 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5798 Chain = Addr.getValue(1); 5799 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); 5800 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5801 } else { 5802 Addr = 5803 DAG.getLoad(PTy, dl, Chain, Addr, 5804 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5805 Chain = Addr.getValue(1); 5806 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5807 } 5808 } 5809 5810 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 5811 EVT VT = Op.getValueType(); 5812 SDLoc dl(Op); 5813 5814 if (Op.getValueType().getVectorElementType() == MVT::i32) { 5815 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 5816 return Op; 5817 return DAG.UnrollVectorOp(Op.getNode()); 5818 } 5819 5820 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16(); 5821 5822 EVT NewTy; 5823 const EVT OpTy = Op.getOperand(0).getValueType(); 5824 if (OpTy == MVT::v4f32) 5825 NewTy = MVT::v4i32; 5826 else if (OpTy == MVT::v4f16 && HasFullFP16) 5827 NewTy = MVT::v4i16; 5828 else if (OpTy == MVT::v8f16 && HasFullFP16) 5829 NewTy = MVT::v8i16; 5830 else 5831 llvm_unreachable("Invalid type for custom lowering!"); 5832 5833 if (VT != MVT::v4i16 && VT != MVT::v8i16) 5834 return DAG.UnrollVectorOp(Op.getNode()); 5835 5836 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0)); 5837 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 5838 } 5839 5840 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 5841 EVT VT = Op.getValueType(); 5842 if (VT.isVector()) 5843 return LowerVectorFP_TO_INT(Op, DAG); 5844 5845 bool IsStrict = Op->isStrictFPOpcode(); 5846 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 5847 5848 if (isUnsupportedFloatingType(SrcVal.getValueType())) { 5849 RTLIB::Libcall LC; 5850 if (Op.getOpcode() == ISD::FP_TO_SINT || 5851 Op.getOpcode() == ISD::STRICT_FP_TO_SINT) 5852 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), 5853 Op.getValueType()); 5854 else 5855 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), 5856 Op.getValueType()); 5857 SDLoc Loc(Op); 5858 MakeLibCallOptions CallOptions; 5859 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 5860 SDValue Result; 5861 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, 5862 CallOptions, Loc, Chain); 5863 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result; 5864 } 5865 5866 // FIXME: Remove this when we have strict fp instruction selection patterns 5867 if (IsStrict) { 5868 SDLoc Loc(Op); 5869 SDValue Result = 5870 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? 
ISD::FP_TO_SINT 5871 : ISD::FP_TO_UINT, 5872 Loc, Op.getValueType(), SrcVal); 5873 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 5874 } 5875 5876 return Op; 5877 } 5878 5879 static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, 5880 const ARMSubtarget *Subtarget) { 5881 EVT VT = Op.getValueType(); 5882 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 5883 EVT FromVT = Op.getOperand(0).getValueType(); 5884 5885 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32) 5886 return Op; 5887 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 && 5888 Subtarget->hasFP64()) 5889 return Op; 5890 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 && 5891 Subtarget->hasFullFP16()) 5892 return Op; 5893 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 && 5894 Subtarget->hasMVEFloatOps()) 5895 return Op; 5896 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 && 5897 Subtarget->hasMVEFloatOps()) 5898 return Op; 5899 5900 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16) 5901 return SDValue(); 5902 5903 SDLoc DL(Op); 5904 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT; 5905 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned; 5906 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), 5907 DAG.getValueType(VT.getScalarType())); 5908 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT, 5909 DAG.getConstant((1 << BW) - 1, DL, VT)); 5910 if (IsSigned) 5911 Max = DAG.getNode(ISD::SMAX, DL, VT, Max, 5912 DAG.getConstant(-(1 << BW), DL, VT)); 5913 return Max; 5914 } 5915 5916 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5917 EVT VT = Op.getValueType(); 5918 SDLoc dl(Op); 5919 5920 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 5921 if (VT.getVectorElementType() == MVT::f32) 5922 return Op; 5923 return DAG.UnrollVectorOp(Op.getNode()); 5924 } 5925 5926 assert((Op.getOperand(0).getValueType() == MVT::v4i16 || 5927 Op.getOperand(0).getValueType() == MVT::v8i16) && 5928 "Invalid type for custom lowering!"); 5929 5930 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16(); 5931 5932 EVT DestVecType; 5933 if (VT == MVT::v4f32) 5934 DestVecType = MVT::v4i32; 5935 else if (VT == MVT::v4f16 && HasFullFP16) 5936 DestVecType = MVT::v4i16; 5937 else if (VT == MVT::v8f16 && HasFullFP16) 5938 DestVecType = MVT::v8i16; 5939 else 5940 return DAG.UnrollVectorOp(Op.getNode()); 5941 5942 unsigned CastOpc; 5943 unsigned Opc; 5944 switch (Op.getOpcode()) { 5945 default: llvm_unreachable("Invalid opcode!"); 5946 case ISD::SINT_TO_FP: 5947 CastOpc = ISD::SIGN_EXTEND; 5948 Opc = ISD::SINT_TO_FP; 5949 break; 5950 case ISD::UINT_TO_FP: 5951 CastOpc = ISD::ZERO_EXTEND; 5952 Opc = ISD::UINT_TO_FP; 5953 break; 5954 } 5955 5956 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); 5957 return DAG.getNode(Opc, dl, VT, Op); 5958 } 5959 5960 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 5961 EVT VT = Op.getValueType(); 5962 if (VT.isVector()) 5963 return LowerVectorINT_TO_FP(Op, DAG); 5964 if (isUnsupportedFloatingType(VT)) { 5965 RTLIB::Libcall LC; 5966 if (Op.getOpcode() == ISD::SINT_TO_FP) 5967 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 5968 Op.getValueType()); 5969 else 5970 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 5971 Op.getValueType()); 5972 MakeLibCallOptions CallOptions; 5973 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 5974 CallOptions, 
SDLoc(Op)).first; 5975 } 5976 5977 return Op; 5978 } 5979 5980 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 5981 // Implement fcopysign with a fabs and a conditional fneg. 5982 SDValue Tmp0 = Op.getOperand(0); 5983 SDValue Tmp1 = Op.getOperand(1); 5984 SDLoc dl(Op); 5985 EVT VT = Op.getValueType(); 5986 EVT SrcVT = Tmp1.getValueType(); 5987 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 5988 Tmp0.getOpcode() == ARMISD::VMOVDRR; 5989 bool UseNEON = !InGPR && Subtarget->hasNEON(); 5990 5991 if (UseNEON) { 5992 // Use VBSL to copy the sign bit. 5993 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); 5994 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 5995 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 5996 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; 5997 if (VT == MVT::f64) 5998 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5999 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 6000 DAG.getConstant(32, dl, MVT::i32)); 6001 else /*if (VT == MVT::f32)*/ 6002 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 6003 if (SrcVT == MVT::f32) { 6004 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 6005 if (VT == MVT::f64) 6006 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 6007 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 6008 DAG.getConstant(32, dl, MVT::i32)); 6009 } else if (VT == MVT::f32) 6010 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, 6011 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 6012 DAG.getConstant(32, dl, MVT::i32)); 6013 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 6014 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 6015 6016 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), 6017 dl, MVT::i32); 6018 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 6019 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 6020 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 6021 6022 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 6023 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 6024 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 6025 if (VT == MVT::f32) { 6026 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 6027 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 6028 DAG.getConstant(0, dl, MVT::i32)); 6029 } else { 6030 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 6031 } 6032 6033 return Res; 6034 } 6035 6036 // Bitcast operand 1 to i32. 6037 if (SrcVT == MVT::f64) 6038 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 6039 Tmp1).getValue(1); 6040 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 6041 6042 // Or in the signbit with integer operations. 6043 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 6044 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 6045 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 6046 if (VT == MVT::f32) { 6047 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 6048 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 6049 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 6050 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 6051 } 6052 6053 // f64: Or the high part with signbit and then combine two parts. 
6054 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 6055 Tmp0); 6056 SDValue Lo = Tmp0.getValue(0); 6057 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 6058 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 6059 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 6060 } 6061 6062 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 6063 MachineFunction &MF = DAG.getMachineFunction(); 6064 MachineFrameInfo &MFI = MF.getFrameInfo(); 6065 MFI.setReturnAddressIsTaken(true); 6066 6067 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 6068 return SDValue(); 6069 6070 EVT VT = Op.getValueType(); 6071 SDLoc dl(Op); 6072 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6073 if (Depth) { 6074 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 6075 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 6076 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 6077 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 6078 MachinePointerInfo()); 6079 } 6080 6081 // Return LR, which contains the return address. Mark it an implicit live-in. 6082 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 6083 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 6084 } 6085 6086 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 6087 const ARMBaseRegisterInfo &ARI = 6088 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 6089 MachineFunction &MF = DAG.getMachineFunction(); 6090 MachineFrameInfo &MFI = MF.getFrameInfo(); 6091 MFI.setFrameAddressIsTaken(true); 6092 6093 EVT VT = Op.getValueType(); 6094 SDLoc dl(Op); // FIXME probably not meaningful 6095 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6096 Register FrameReg = ARI.getFrameRegister(MF); 6097 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 6098 while (Depth--) 6099 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 6100 MachinePointerInfo()); 6101 return FrameAddr; 6102 } 6103 6104 // FIXME? Maybe this could be a TableGen attribute on some registers and 6105 // this table could be generated automatically from RegInfo. 6106 Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT, 6107 const MachineFunction &MF) const { 6108 Register Reg = StringSwitch<unsigned>(RegName) 6109 .Case("sp", ARM::SP) 6110 .Default(0); 6111 if (Reg) 6112 return Reg; 6113 report_fatal_error(Twine("Invalid register name \"" 6114 + StringRef(RegName) + "\".")); 6115 } 6116 6117 // Result is 64 bit value so split into two 32 bit values and return as a 6118 // pair of values. 6119 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 6120 SelectionDAG &DAG) { 6121 SDLoc DL(N); 6122 6123 // This function is only supposed to be called for i64 type destination. 6124 assert(N->getValueType(0) == MVT::i64 6125 && "ExpandREAD_REGISTER called for non-i64 type result."); 6126 6127 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 6128 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 6129 N->getOperand(0), 6130 N->getOperand(1)); 6131 6132 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 6133 Read.getValue(1))); 6134 Results.push_back(Read.getOperand(0)); 6135 } 6136 6137 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 
6138 /// When \p DstVT, the destination type of \p BC, is on the vector 6139 /// register bank and the source of bitcast, \p Op, operates on the same bank, 6140 /// it might be possible to combine them, such that everything stays on the 6141 /// vector register bank. 6142 /// \p return The node that would replace \p BT, if the combine 6143 /// is possible. 6144 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, 6145 SelectionDAG &DAG) { 6146 SDValue Op = BC->getOperand(0); 6147 EVT DstVT = BC->getValueType(0); 6148 6149 // The only vector instruction that can produce a scalar (remember, 6150 // since the bitcast was about to be turned into VMOVDRR, the source 6151 // type is i64) from a vector is EXTRACT_VECTOR_ELT. 6152 // Moreover, we can do this combine only if there is one use. 6153 // Finally, if the destination type is not a vector, there is not 6154 // much point on forcing everything on the vector bank. 6155 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 6156 !Op.hasOneUse()) 6157 return SDValue(); 6158 6159 // If the index is not constant, we will introduce an additional 6160 // multiply that will stick. 6161 // Give up in that case. 6162 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6163 if (!Index) 6164 return SDValue(); 6165 unsigned DstNumElt = DstVT.getVectorNumElements(); 6166 6167 // Compute the new index. 6168 const APInt &APIntIndex = Index->getAPIntValue(); 6169 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); 6170 NewIndex *= APIntIndex; 6171 // Check if the new constant index fits into i32. 6172 if (NewIndex.getBitWidth() > 32) 6173 return SDValue(); 6174 6175 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> 6176 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) 6177 SDLoc dl(Op); 6178 SDValue ExtractSrc = Op.getOperand(0); 6179 EVT VecVT = EVT::getVectorVT( 6180 *DAG.getContext(), DstVT.getScalarType(), 6181 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); 6182 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); 6183 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, 6184 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); 6185 } 6186 6187 /// ExpandBITCAST - If the target supports VFP, this function is called to 6188 /// expand a bit convert where either the source or destination type is i64 to 6189 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 6190 /// operand type is illegal (e.g., v2f32 for a target that doesn't support 6191 /// vectors), since the legalizer won't know what to do with that. 6192 SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG, 6193 const ARMSubtarget *Subtarget) const { 6194 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6195 SDLoc dl(N); 6196 SDValue Op = N->getOperand(0); 6197 6198 // This function is only supposed to be called for i16 and i64 types, either 6199 // as the source or destination of the bit convert. 
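  // In the i64 cases this means, for example, that (f64 (bitcast (i64 X)))
  // becomes a VMOVDRR of the two i32 halves of X, and (i64 (bitcast (f64 X)))
  // becomes a BUILD_PAIR of the two results of a VMOVRRD, as done below.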
6200 EVT SrcVT = Op.getValueType(); 6201 EVT DstVT = N->getValueType(0); 6202 6203 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) && 6204 (DstVT == MVT::f16 || DstVT == MVT::bf16)) 6205 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(), 6206 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op)); 6207 6208 if ((DstVT == MVT::i16 || DstVT == MVT::i32) && 6209 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) 6210 return DAG.getNode( 6211 ISD::TRUNCATE, SDLoc(N), DstVT, 6212 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op)); 6213 6214 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 6215 return SDValue(); 6216 6217 // Turn i64->f64 into VMOVDRR. 6218 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 6219 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 6220 // if we can combine the bitcast with its source. 6221 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 6222 return Val; 6223 6224 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 6225 DAG.getConstant(0, dl, MVT::i32)); 6226 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 6227 DAG.getConstant(1, dl, MVT::i32)); 6228 return DAG.getNode(ISD::BITCAST, dl, DstVT, 6229 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 6230 } 6231 6232 // Turn f64->i64 into VMOVRRD. 6233 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 6234 SDValue Cvt; 6235 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 6236 SrcVT.getVectorNumElements() > 1) 6237 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 6238 DAG.getVTList(MVT::i32, MVT::i32), 6239 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 6240 else 6241 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 6242 DAG.getVTList(MVT::i32, MVT::i32), Op); 6243 // Merge the pieces into a single i64 value. 6244 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 6245 } 6246 6247 return SDValue(); 6248 } 6249 6250 /// getZeroVector - Returns a vector of specified type with all zero elements. 6251 /// Zero vectors are used to represent vector negation and in those cases 6252 /// will be implemented with the NEON VNEG instruction. However, VNEG does 6253 /// not support i64 elements, so sometimes the zero vectors will need to be 6254 /// explicitly constructed. Regardless, use a canonical VMOV to create the 6255 /// zero vector. 6256 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 6257 assert(VT.isVector() && "Expected a vector type"); 6258 // The canonical modified immediate encoding of a zero vector is....0! 6259 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 6260 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 6261 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 6262 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 6263 } 6264 6265 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 6266 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 6267 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 6268 SelectionDAG &DAG) const { 6269 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6270 EVT VT = Op.getValueType(); 6271 unsigned VTBits = VT.getSizeInBits(); 6272 SDLoc dl(Op); 6273 SDValue ShOpLo = Op.getOperand(0); 6274 SDValue ShOpHi = Op.getOperand(1); 6275 SDValue ShAmt = Op.getOperand(2); 6276 SDValue ARMcc; 6277 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6278 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? 
ISD::SRA : ISD::SRL; 6279 6280 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 6281 6282 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6283 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 6284 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 6285 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 6286 DAG.getConstant(VTBits, dl, MVT::i32)); 6287 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 6288 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 6289 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 6290 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6291 ISD::SETGE, ARMcc, DAG, dl); 6292 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, 6293 ARMcc, CCR, CmpLo); 6294 6295 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 6296 SDValue HiBigShift = Opc == ISD::SRA 6297 ? DAG.getNode(Opc, dl, VT, ShOpHi, 6298 DAG.getConstant(VTBits - 1, dl, VT)) 6299 : DAG.getConstant(0, dl, VT); 6300 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6301 ISD::SETGE, ARMcc, DAG, dl); 6302 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 6303 ARMcc, CCR, CmpHi); 6304 6305 SDValue Ops[2] = { Lo, Hi }; 6306 return DAG.getMergeValues(Ops, dl); 6307 } 6308 6309 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 6310 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 6311 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 6312 SelectionDAG &DAG) const { 6313 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6314 EVT VT = Op.getValueType(); 6315 unsigned VTBits = VT.getSizeInBits(); 6316 SDLoc dl(Op); 6317 SDValue ShOpLo = Op.getOperand(0); 6318 SDValue ShOpHi = Op.getOperand(1); 6319 SDValue ShAmt = Op.getOperand(2); 6320 SDValue ARMcc; 6321 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6322 6323 assert(Op.getOpcode() == ISD::SHL_PARTS); 6324 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6325 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 6326 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 6327 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 6328 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 6329 6330 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 6331 DAG.getConstant(VTBits, dl, MVT::i32)); 6332 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 6333 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6334 ISD::SETGE, ARMcc, DAG, dl); 6335 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 6336 ARMcc, CCR, CmpHi); 6337 6338 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6339 ISD::SETGE, ARMcc, DAG, dl); 6340 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6341 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, 6342 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); 6343 6344 SDValue Ops[2] = { Lo, Hi }; 6345 return DAG.getMergeValues(Ops, dl); 6346 } 6347 6348 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 6349 SelectionDAG &DAG) const { 6350 // The rounding mode is in bits 23:22 of the FPSCR. 6351 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 6352 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 6353 // so that the shift + and get folded into a bitfield extract. 
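  // For example, an FPSCR rounding field of 0b11 yields ((3 + 1) & 3) == 0,
  // matching the 3->0 entry of the mapping above. The constant is added at
  // bit 22, so the low FPSCR bits cannot carry into the extracted field.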
6354 SDLoc dl(Op); 6355 SDValue Chain = Op.getOperand(0); 6356 SDValue Ops[] = {Chain, 6357 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)}; 6358 6359 SDValue FPSCR = 6360 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops); 6361 Chain = FPSCR.getValue(1); 6362 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 6363 DAG.getConstant(1U << 22, dl, MVT::i32)); 6364 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 6365 DAG.getConstant(22, dl, MVT::i32)); 6366 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 6367 DAG.getConstant(3, dl, MVT::i32)); 6368 return DAG.getMergeValues({And, Chain}, dl); 6369 } 6370 6371 SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op, 6372 SelectionDAG &DAG) const { 6373 SDLoc DL(Op); 6374 SDValue Chain = Op->getOperand(0); 6375 SDValue RMValue = Op->getOperand(1); 6376 6377 // The rounding mode is in bits 23:22 of the FPSCR. 6378 // The llvm.set.rounding argument value to ARM rounding mode value mapping 6379 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is 6380 // ((arg - 1) & 3) << 22). 6381 // 6382 // It is expected that the argument of llvm.set.rounding is within the 6383 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is 6384 // responsibility of the code generated llvm.set.rounding to ensure this 6385 // condition. 6386 6387 // Calculate new value of FPSCR[23:22]. 6388 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue, 6389 DAG.getConstant(1, DL, MVT::i32)); 6390 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue, 6391 DAG.getConstant(0x3, DL, MVT::i32)); 6392 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue, 6393 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32)); 6394 6395 // Get current value of FPSCR. 6396 SDValue Ops[] = {Chain, 6397 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)}; 6398 SDValue FPSCR = 6399 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops); 6400 Chain = FPSCR.getValue(1); 6401 FPSCR = FPSCR.getValue(0); 6402 6403 // Put new rounding mode into FPSCR[23:22]. 
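  // For example, an argument of 0 was turned into RMValue == 0b11 << 22 above
  // (the 0->3 entry of the mapping); the AND below clears the old FPSCR[23:22]
  // and the OR installs the new field before it is written back via the
  // arm_set_fpscr intrinsic.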
6404 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos); 6405 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR, 6406 DAG.getConstant(RMMask, DL, MVT::i32)); 6407 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue); 6408 SDValue Ops2[] = { 6409 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR}; 6410 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); 6411 } 6412 6413 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 6414 const ARMSubtarget *ST) { 6415 SDLoc dl(N); 6416 EVT VT = N->getValueType(0); 6417 if (VT.isVector() && ST->hasNEON()) { 6418 6419 // Compute the least significant set bit: LSB = X & -X 6420 SDValue X = N->getOperand(0); 6421 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 6422 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 6423 6424 EVT ElemTy = VT.getVectorElementType(); 6425 6426 if (ElemTy == MVT::i8) { 6427 // Compute with: cttz(x) = ctpop(lsb - 1) 6428 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6429 DAG.getTargetConstant(1, dl, ElemTy)); 6430 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 6431 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 6432 } 6433 6434 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 6435 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 6436 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 6437 unsigned NumBits = ElemTy.getSizeInBits(); 6438 SDValue WidthMinus1 = 6439 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6440 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 6441 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 6442 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 6443 } 6444 6445 // Compute with: cttz(x) = ctpop(lsb - 1) 6446 6447 // Compute LSB - 1. 6448 SDValue Bits; 6449 if (ElemTy == MVT::i64) { 6450 // Load constant 0xffff'ffff'ffff'ffff to register. 6451 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6452 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 6453 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 6454 } else { 6455 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6456 DAG.getTargetConstant(1, dl, ElemTy)); 6457 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 6458 } 6459 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 6460 } 6461 6462 if (!ST->hasV6T2Ops()) 6463 return SDValue(); 6464 6465 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 6466 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 6467 } 6468 6469 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 6470 const ARMSubtarget *ST) { 6471 EVT VT = N->getValueType(0); 6472 SDLoc DL(N); 6473 6474 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 6475 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 6476 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 6477 "Unexpected type for custom ctpop lowering"); 6478 6479 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6480 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 6481 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); 6482 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); 6483 6484 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 6485 unsigned EltSize = 8; 6486 unsigned NumElts = VT.is64BitVector() ? 
8 : 16; 6487 while (EltSize != VT.getScalarSizeInBits()) { 6488 SmallVector<SDValue, 8> Ops; 6489 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, 6490 TLI.getPointerTy(DAG.getDataLayout()))); 6491 Ops.push_back(Res); 6492 6493 EltSize *= 2; 6494 NumElts /= 2; 6495 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 6496 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); 6497 } 6498 6499 return Res; 6500 } 6501 6502 /// Getvshiftimm - Check if this is a valid build_vector for the immediate 6503 /// operand of a vector shift operation, where all the elements of the 6504 /// build_vector must have the same constant integer value. 6505 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 6506 // Ignore bit_converts. 6507 while (Op.getOpcode() == ISD::BITCAST) 6508 Op = Op.getOperand(0); 6509 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 6510 APInt SplatBits, SplatUndef; 6511 unsigned SplatBitSize; 6512 bool HasAnyUndefs; 6513 if (!BVN || 6514 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 6515 ElementBits) || 6516 SplatBitSize > ElementBits) 6517 return false; 6518 Cnt = SplatBits.getSExtValue(); 6519 return true; 6520 } 6521 6522 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 6523 /// operand of a vector shift left operation. That value must be in the range: 6524 /// 0 <= Value < ElementBits for a left shift; or 6525 /// 0 <= Value <= ElementBits for a long left shift. 6526 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 6527 assert(VT.isVector() && "vector shift count is not a vector type"); 6528 int64_t ElementBits = VT.getScalarSizeInBits(); 6529 if (!getVShiftImm(Op, ElementBits, Cnt)) 6530 return false; 6531 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); 6532 } 6533 6534 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 6535 /// operand of a vector shift right operation. For a shift opcode, the value 6536 /// is positive, but for an intrinsic the value count must be negative. The 6537 /// absolute value must be in the range: 6538 /// 1 <= |Value| <= ElementBits for a right shift; or 6539 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 6540 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 6541 int64_t &Cnt) { 6542 assert(VT.isVector() && "vector shift count is not a vector type"); 6543 int64_t ElementBits = VT.getScalarSizeInBits(); 6544 if (!getVShiftImm(Op, ElementBits, Cnt)) 6545 return false; 6546 if (!isIntrinsic) 6547 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); 6548 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) { 6549 Cnt = -Cnt; 6550 return true; 6551 } 6552 return false; 6553 } 6554 6555 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 6556 const ARMSubtarget *ST) { 6557 EVT VT = N->getValueType(0); 6558 SDLoc dl(N); 6559 int64_t Cnt; 6560 6561 if (!VT.isVector()) 6562 return SDValue(); 6563 6564 // We essentially have two forms here. Shift by an immediate and shift by a 6565 // vector register (there are also shift by a gpr, but that is just handled 6566 // with a tablegen pattern). We cannot easily match shift by an immediate in 6567 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM. 6568 // For shifting by a vector, we don't have VSHR, only VSHL (which can be 6569 // signed or unsigned, and a negative shift indicates a shift right). 
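  // For example, (v4i32 (srl X, (splat 3))) is matched below as
  // VSHRuIMM X, #3, while a non-constant (v4i32 (srl X, Y)) becomes
  // VSHLu X, (sub 0, Y), i.e. a shift left by the negated amount.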
6570 if (N->getOpcode() == ISD::SHL) { 6571 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) 6572 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 6573 DAG.getConstant(Cnt, dl, MVT::i32)); 6574 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), 6575 N->getOperand(1)); 6576 } 6577 6578 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && 6579 "unexpected vector shift opcode"); 6580 6581 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 6582 unsigned VShiftOpc = 6583 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 6584 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 6585 DAG.getConstant(Cnt, dl, MVT::i32)); 6586 } 6587 6588 // Other right shifts we don't have operations for (we use a shift left by a 6589 // negative number). 6590 EVT ShiftVT = N->getOperand(1).getValueType(); 6591 SDValue NegatedCount = DAG.getNode( 6592 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); 6593 unsigned VShiftOpc = 6594 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu); 6595 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); 6596 } 6597 6598 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 6599 const ARMSubtarget *ST) { 6600 EVT VT = N->getValueType(0); 6601 SDLoc dl(N); 6602 6603 // We can get here for a node like i32 = ISD::SHL i32, i64 6604 if (VT != MVT::i64) 6605 return SDValue(); 6606 6607 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || 6608 N->getOpcode() == ISD::SHL) && 6609 "Unknown shift to lower!"); 6610 6611 unsigned ShOpc = N->getOpcode(); 6612 if (ST->hasMVEIntegerOps()) { 6613 SDValue ShAmt = N->getOperand(1); 6614 unsigned ShPartsOpc = ARMISD::LSLL; 6615 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); 6616 6617 // If the shift amount is greater than 32 or has a greater bitwidth than 64 6618 // then do the default optimisation 6619 if (ShAmt->getValueType(0).getSizeInBits() > 64 || 6620 (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) 6621 return SDValue(); 6622 6623 // Extract the lower 32 bits of the shift amount if it's not an i32 6624 if (ShAmt->getValueType(0) != MVT::i32) 6625 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); 6626 6627 if (ShOpc == ISD::SRL) { 6628 if (!Con) 6629 // There is no t2LSRLr instruction so negate and perform an lsll if the 6630 // shift amount is in a register, emulating a right shift. 6631 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6632 DAG.getConstant(0, dl, MVT::i32), ShAmt); 6633 else 6634 // Else generate an lsrl on the immediate shift amount 6635 ShPartsOpc = ARMISD::LSRL; 6636 } else if (ShOpc == ISD::SRA) 6637 ShPartsOpc = ARMISD::ASRL; 6638 6639 // Lower 32 bits of the destination/source 6640 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6641 DAG.getConstant(0, dl, MVT::i32)); 6642 // Upper 32 bits of the destination/source 6643 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6644 DAG.getConstant(1, dl, MVT::i32)); 6645 6646 // Generate the shift operation as computed above 6647 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, 6648 ShAmt); 6649 // The upper 32 bits come from the second return value of lsll 6650 Hi = SDValue(Lo.getNode(), 1); 6651 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6652 } 6653 6654 // We only lower SRA, SRL of 1 here, all others use generic lowering. 
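  // For example, (i64 (srl X, 1)) becomes an SRL_FLAG of the high word (its
  // bit 0 ends up in the carry) followed by an RRX of the low word; anything
  // else here is left to the generic expansion.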
6655 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) 6656 return SDValue(); 6657 6658 // If we are in thumb mode, we don't have RRX. 6659 if (ST->isThumb1Only()) 6660 return SDValue(); 6661 6662 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 6663 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6664 DAG.getConstant(0, dl, MVT::i32)); 6665 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6666 DAG.getConstant(1, dl, MVT::i32)); 6667 6668 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 6669 // captures the result into a carry flag. 6670 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 6671 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 6672 6673 // The low part is an ARMISD::RRX operand, which shifts the carry in. 6674 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 6675 6676 // Merge the pieces into a single i64 value. 6677 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6678 } 6679 6680 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, 6681 const ARMSubtarget *ST) { 6682 bool Invert = false; 6683 bool Swap = false; 6684 unsigned Opc = ARMCC::AL; 6685 6686 SDValue Op0 = Op.getOperand(0); 6687 SDValue Op1 = Op.getOperand(1); 6688 SDValue CC = Op.getOperand(2); 6689 EVT VT = Op.getValueType(); 6690 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6691 SDLoc dl(Op); 6692 6693 EVT CmpVT; 6694 if (ST->hasNEON()) 6695 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 6696 else { 6697 assert(ST->hasMVEIntegerOps() && 6698 "No hardware support for integer vector comparison!"); 6699 6700 if (Op.getValueType().getVectorElementType() != MVT::i1) 6701 return SDValue(); 6702 6703 // Make sure we expand floating point setcc to scalar if we do not have 6704 // mve.fp, so that we can handle them from there. 6705 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) 6706 return SDValue(); 6707 6708 CmpVT = VT; 6709 } 6710 6711 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 6712 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 6713 // Special-case integer 64-bit equality comparisons. They aren't legal, 6714 // but they can be lowered with a few vector instructions. 6715 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 6716 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 6717 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 6718 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 6719 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 6720 DAG.getCondCode(ISD::SETEQ)); 6721 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 6722 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 6723 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 6724 if (SetCCOpcode == ISD::SETNE) 6725 Merged = DAG.getNOT(dl, Merged, CmpVT); 6726 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 6727 return Merged; 6728 } 6729 6730 if (CmpVT.getVectorElementType() == MVT::i64) 6731 // 64-bit comparisons are not legal in general. 
6732 return SDValue(); 6733 6734 if (Op1.getValueType().isFloatingPoint()) { 6735 switch (SetCCOpcode) { 6736 default: llvm_unreachable("Illegal FP comparison"); 6737 case ISD::SETUNE: 6738 case ISD::SETNE: 6739 if (ST->hasMVEFloatOps()) { 6740 Opc = ARMCC::NE; break; 6741 } else { 6742 Invert = true; LLVM_FALLTHROUGH; 6743 } 6744 case ISD::SETOEQ: 6745 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6746 case ISD::SETOLT: 6747 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6748 case ISD::SETOGT: 6749 case ISD::SETGT: Opc = ARMCC::GT; break; 6750 case ISD::SETOLE: 6751 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6752 case ISD::SETOGE: 6753 case ISD::SETGE: Opc = ARMCC::GE; break; 6754 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 6755 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; 6756 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 6757 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; 6758 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 6759 case ISD::SETONE: { 6760 // Expand this to (OLT | OGT). 6761 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6762 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6763 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6764 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6765 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6766 if (Invert) 6767 Result = DAG.getNOT(dl, Result, VT); 6768 return Result; 6769 } 6770 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; 6771 case ISD::SETO: { 6772 // Expand this to (OLT | OGE). 6773 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6774 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6775 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6776 DAG.getConstant(ARMCC::GE, dl, MVT::i32)); 6777 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6778 if (Invert) 6779 Result = DAG.getNOT(dl, Result, VT); 6780 return Result; 6781 } 6782 } 6783 } else { 6784 // Integer comparisons. 6785 switch (SetCCOpcode) { 6786 default: llvm_unreachable("Illegal integer comparison"); 6787 case ISD::SETNE: 6788 if (ST->hasMVEIntegerOps()) { 6789 Opc = ARMCC::NE; break; 6790 } else { 6791 Invert = true; LLVM_FALLTHROUGH; 6792 } 6793 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6794 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6795 case ISD::SETGT: Opc = ARMCC::GT; break; 6796 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6797 case ISD::SETGE: Opc = ARMCC::GE; break; 6798 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 6799 case ISD::SETUGT: Opc = ARMCC::HI; break; 6800 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 6801 case ISD::SETUGE: Opc = ARMCC::HS; break; 6802 } 6803 6804 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 6805 if (ST->hasNEON() && Opc == ARMCC::EQ) { 6806 SDValue AndOp; 6807 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6808 AndOp = Op0; 6809 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 6810 AndOp = Op1; 6811 6812 // Ignore bitconvert. 
6813 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 6814 AndOp = AndOp.getOperand(0); 6815 6816 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 6817 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 6818 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 6819 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); 6820 if (!Invert) 6821 Result = DAG.getNOT(dl, Result, VT); 6822 return Result; 6823 } 6824 } 6825 } 6826 6827 if (Swap) 6828 std::swap(Op0, Op1); 6829 6830 // If one of the operands is a constant vector zero, attempt to fold the 6831 // comparison to a specialized compare-against-zero form. 6832 SDValue SingleOp; 6833 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6834 SingleOp = Op0; 6835 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 6836 if (Opc == ARMCC::GE) 6837 Opc = ARMCC::LE; 6838 else if (Opc == ARMCC::GT) 6839 Opc = ARMCC::LT; 6840 SingleOp = Op1; 6841 } 6842 6843 SDValue Result; 6844 if (SingleOp.getNode()) { 6845 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, 6846 DAG.getConstant(Opc, dl, MVT::i32)); 6847 } else { 6848 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6849 DAG.getConstant(Opc, dl, MVT::i32)); 6850 } 6851 6852 Result = DAG.getSExtOrTrunc(Result, dl, VT); 6853 6854 if (Invert) 6855 Result = DAG.getNOT(dl, Result, VT); 6856 6857 return Result; 6858 } 6859 6860 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { 6861 SDValue LHS = Op.getOperand(0); 6862 SDValue RHS = Op.getOperand(1); 6863 SDValue Carry = Op.getOperand(2); 6864 SDValue Cond = Op.getOperand(3); 6865 SDLoc DL(Op); 6866 6867 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); 6868 6869 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 6870 // have to invert the carry first. 6871 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 6872 DAG.getConstant(1, DL, MVT::i32), Carry); 6873 // This converts the boolean value carry into the carry flag. 6874 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 6875 6876 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 6877 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 6878 6879 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 6880 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 6881 SDValue ARMcc = DAG.getConstant( 6882 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 6883 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6884 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 6885 Cmp.getValue(1), SDValue()); 6886 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 6887 CCR, Chain.getValue(1)); 6888 } 6889 6890 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a 6891 /// valid vector constant for a NEON or MVE instruction with a "modified 6892 /// immediate" operand (e.g., VMOV). If so, return the encoded value. 6893 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 6894 unsigned SplatBitSize, SelectionDAG &DAG, 6895 const SDLoc &dl, EVT &VT, EVT VectorVT, 6896 VMOVModImmType type) { 6897 unsigned OpCmode, Imm; 6898 bool is128Bits = VectorVT.is128BitVector(); 6899 6900 // SplatBitSize is set to the smallest size that splats the vector, so a 6901 // zero vector will always have SplatBitSize == 8. 
However, NEON modified 6902 // immediate instructions others than VMOV do not support the 8-bit encoding 6903 // of a zero vector, and the default encoding of zero is supposed to be the 6904 // 32-bit version. 6905 if (SplatBits == 0) 6906 SplatBitSize = 32; 6907 6908 switch (SplatBitSize) { 6909 case 8: 6910 if (type != VMOVModImm) 6911 return SDValue(); 6912 // Any 1-byte value is OK. Op=0, Cmode=1110. 6913 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 6914 OpCmode = 0xe; 6915 Imm = SplatBits; 6916 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 6917 break; 6918 6919 case 16: 6920 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 6921 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 6922 if ((SplatBits & ~0xff) == 0) { 6923 // Value = 0x00nn: Op=x, Cmode=100x. 6924 OpCmode = 0x8; 6925 Imm = SplatBits; 6926 break; 6927 } 6928 if ((SplatBits & ~0xff00) == 0) { 6929 // Value = 0xnn00: Op=x, Cmode=101x. 6930 OpCmode = 0xa; 6931 Imm = SplatBits >> 8; 6932 break; 6933 } 6934 return SDValue(); 6935 6936 case 32: 6937 // NEON's 32-bit VMOV supports splat values where: 6938 // * only one byte is nonzero, or 6939 // * the least significant byte is 0xff and the second byte is nonzero, or 6940 // * the least significant 2 bytes are 0xff and the third is nonzero. 6941 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 6942 if ((SplatBits & ~0xff) == 0) { 6943 // Value = 0x000000nn: Op=x, Cmode=000x. 6944 OpCmode = 0; 6945 Imm = SplatBits; 6946 break; 6947 } 6948 if ((SplatBits & ~0xff00) == 0) { 6949 // Value = 0x0000nn00: Op=x, Cmode=001x. 6950 OpCmode = 0x2; 6951 Imm = SplatBits >> 8; 6952 break; 6953 } 6954 if ((SplatBits & ~0xff0000) == 0) { 6955 // Value = 0x00nn0000: Op=x, Cmode=010x. 6956 OpCmode = 0x4; 6957 Imm = SplatBits >> 16; 6958 break; 6959 } 6960 if ((SplatBits & ~0xff000000) == 0) { 6961 // Value = 0xnn000000: Op=x, Cmode=011x. 6962 OpCmode = 0x6; 6963 Imm = SplatBits >> 24; 6964 break; 6965 } 6966 6967 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 6968 if (type == OtherModImm) return SDValue(); 6969 6970 if ((SplatBits & ~0xffff) == 0 && 6971 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 6972 // Value = 0x0000nnff: Op=x, Cmode=1100. 6973 OpCmode = 0xc; 6974 Imm = SplatBits >> 8; 6975 break; 6976 } 6977 6978 // cmode == 0b1101 is not supported for MVE VMVN 6979 if (type == MVEVMVNModImm) 6980 return SDValue(); 6981 6982 if ((SplatBits & ~0xffffff) == 0 && 6983 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 6984 // Value = 0x00nnffff: Op=x, Cmode=1101. 6985 OpCmode = 0xd; 6986 Imm = SplatBits >> 16; 6987 break; 6988 } 6989 6990 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 6991 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 6992 // VMOV.I32. A (very) minor optimization would be to replicate the value 6993 // and fall through here to test for a valid 64-bit splat. But, then the 6994 // caller would also need to check and handle the change in size. 6995 return SDValue(); 6996 6997 case 64: { 6998 if (type != VMOVModImm) 6999 return SDValue(); 7000 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 
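    // Each byte of the splat value maps to one bit of the 8-bit immediate:
    // e.g. 0x00ff00ff00ff00ff gives Imm == 0x55. A byte that is neither all
    // zeros nor all ones rejects the value.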
7001 uint64_t BitMask = 0xff; 7002 unsigned ImmMask = 1; 7003 Imm = 0; 7004 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 7005 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 7006 Imm |= ImmMask; 7007 } else if ((SplatBits & BitMask) != 0) { 7008 return SDValue(); 7009 } 7010 BitMask <<= 8; 7011 ImmMask <<= 1; 7012 } 7013 7014 if (DAG.getDataLayout().isBigEndian()) { 7015 // Reverse the order of elements within the vector. 7016 unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8; 7017 unsigned Mask = (1 << BytesPerElem) - 1; 7018 unsigned NumElems = 8 / BytesPerElem; 7019 unsigned NewImm = 0; 7020 for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) { 7021 unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask); 7022 NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem; 7023 } 7024 Imm = NewImm; 7025 } 7026 7027 // Op=1, Cmode=1110. 7028 OpCmode = 0x1e; 7029 VT = is128Bits ? MVT::v2i64 : MVT::v1i64; 7030 break; 7031 } 7032 7033 default: 7034 llvm_unreachable("unexpected size for isVMOVModifiedImm"); 7035 } 7036 7037 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); 7038 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 7039 } 7040 7041 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 7042 const ARMSubtarget *ST) const { 7043 EVT VT = Op.getValueType(); 7044 bool IsDouble = (VT == MVT::f64); 7045 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 7046 const APFloat &FPVal = CFP->getValueAPF(); 7047 7048 // Prevent floating-point constants from using literal loads 7049 // when execute-only is enabled. 7050 if (ST->genExecuteOnly()) { 7051 // If we can represent the constant as an immediate, don't lower it 7052 if (isFPImmLegal(FPVal, VT)) 7053 return Op; 7054 // Otherwise, construct as integer, and move to float register 7055 APInt INTVal = FPVal.bitcastToAPInt(); 7056 SDLoc DL(CFP); 7057 switch (VT.getSimpleVT().SimpleTy) { 7058 default: 7059 llvm_unreachable("Unknown floating point type!"); 7060 break; 7061 case MVT::f64: { 7062 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 7063 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 7064 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 7065 } 7066 case MVT::f32: 7067 return DAG.getNode(ARMISD::VMOVSR, DL, VT, 7068 DAG.getConstant(INTVal, DL, MVT::i32)); 7069 } 7070 } 7071 7072 if (!ST->hasVFP3Base()) 7073 return SDValue(); 7074 7075 // Use the default (constant pool) lowering for double constants when we have 7076 // an SP-only FPU 7077 if (IsDouble && !Subtarget->hasFP64()) 7078 return SDValue(); 7079 7080 // Try splatting with a VMOV.f32... 7081 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 7082 7083 if (ImmVal != -1) { 7084 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 7085 // We have code in place to select a valid ConstantFP already, no need to 7086 // do any mangling. 7087 return Op; 7088 } 7089 7090 // It's a float and we are trying to use NEON operations where 7091 // possible. Lower it to a splat followed by an extract. 7092 SDLoc DL(Op); 7093 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 7094 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 7095 NewVal); 7096 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 7097 DAG.getConstant(0, DL, MVT::i32)); 7098 } 7099 7100 // The rest of our options are NEON only, make sure that's allowed before 7101 // proceeding.. 
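  // With NEON we can still materialise some values without a load by building
  // the 32-bit image of the constant as a VMOV.i32 or VMVN.i32 in a vector
  // register and bitcasting it back to f32/f64, which is what the code below
  // attempts.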
7102 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 7103 return SDValue(); 7104 7105 EVT VMovVT; 7106 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 7107 7108 // It wouldn't really be worth bothering for doubles except for one very 7109 // important value, which does happen to match: 0.0. So make sure we don't do 7110 // anything stupid. 7111 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 7112 return SDValue(); 7113 7114 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 7115 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 7116 VMovVT, VT, VMOVModImm); 7117 if (NewVal != SDValue()) { 7118 SDLoc DL(Op); 7119 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 7120 NewVal); 7121 if (IsDouble) 7122 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 7123 7124 // It's a float: cast and extract a vector element. 7125 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 7126 VecConstant); 7127 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 7128 DAG.getConstant(0, DL, MVT::i32)); 7129 } 7130 7131 // Finally, try a VMVN.i32 7132 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 7133 VT, VMVNModImm); 7134 if (NewVal != SDValue()) { 7135 SDLoc DL(Op); 7136 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 7137 7138 if (IsDouble) 7139 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 7140 7141 // It's a float: cast and extract a vector element. 7142 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 7143 VecConstant); 7144 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 7145 DAG.getConstant(0, DL, MVT::i32)); 7146 } 7147 7148 return SDValue(); 7149 } 7150 7151 // check if an VEXT instruction can handle the shuffle mask when the 7152 // vector sources of the shuffle are the same. 7153 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 7154 unsigned NumElts = VT.getVectorNumElements(); 7155 7156 // Assume that the first shuffle index is not UNDEF. Fail if it is. 7157 if (M[0] < 0) 7158 return false; 7159 7160 Imm = M[0]; 7161 7162 // If this is a VEXT shuffle, the immediate value is the index of the first 7163 // element. The other shuffle indices must be the successive elements after 7164 // the first one. 7165 unsigned ExpectedElt = Imm; 7166 for (unsigned i = 1; i < NumElts; ++i) { 7167 // Increment the expected index. If it wraps around, just follow it 7168 // back to index zero and keep going. 7169 ++ExpectedElt; 7170 if (ExpectedElt == NumElts) 7171 ExpectedElt = 0; 7172 7173 if (M[i] < 0) continue; // ignore UNDEF indices 7174 if (ExpectedElt != static_cast<unsigned>(M[i])) 7175 return false; 7176 } 7177 7178 return true; 7179 } 7180 7181 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 7182 bool &ReverseVEXT, unsigned &Imm) { 7183 unsigned NumElts = VT.getVectorNumElements(); 7184 ReverseVEXT = false; 7185 7186 // Assume that the first shuffle index is not UNDEF. Fail if it is. 7187 if (M[0] < 0) 7188 return false; 7189 7190 Imm = M[0]; 7191 7192 // If this is a VEXT shuffle, the immediate value is the index of the first 7193 // element. The other shuffle indices must be the successive elements after 7194 // the first one. 7195 unsigned ExpectedElt = Imm; 7196 for (unsigned i = 1; i < NumElts; ++i) { 7197 // Increment the expected index. If it wraps around, it may still be 7198 // a VEXT but the source vectors must be swapped. 
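    // For example, with v4i32 operands the mask <6, 7, 0, 1> wraps past the
    // end of the concatenated pair, so it is a VEXT of the swapped operands
    // with Imm adjusted from 6 down to 2 below.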
7199 ExpectedElt += 1; 7200 if (ExpectedElt == NumElts * 2) { 7201 ExpectedElt = 0; 7202 ReverseVEXT = true; 7203 } 7204 7205 if (M[i] < 0) continue; // ignore UNDEF indices 7206 if (ExpectedElt != static_cast<unsigned>(M[i])) 7207 return false; 7208 } 7209 7210 // Adjust the index value if the source operands will be swapped. 7211 if (ReverseVEXT) 7212 Imm -= NumElts; 7213 7214 return true; 7215 } 7216 7217 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 7218 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 7219 // range, then 0 is placed into the resulting vector. So pretty much any mask 7220 // of 8 elements can work here. 7221 return VT == MVT::v8i8 && M.size() == 8; 7222 } 7223 7224 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 7225 unsigned Index) { 7226 if (Mask.size() == Elements * 2) 7227 return Index / Elements; 7228 return Mask[Index] == 0 ? 0 : 1; 7229 } 7230 7231 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 7232 // checking that pairs of elements in the shuffle mask represent the same index 7233 // in each vector, incrementing the expected index by 2 at each step. 7234 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 7235 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 7236 // v2={e,f,g,h} 7237 // WhichResult gives the offset for each element in the mask based on which 7238 // of the two results it belongs to. 7239 // 7240 // The transpose can be represented either as: 7241 // result1 = shufflevector v1, v2, result1_shuffle_mask 7242 // result2 = shufflevector v1, v2, result2_shuffle_mask 7243 // where v1/v2 and the shuffle masks have the same number of elements 7244 // (here WhichResult (see below) indicates which result is being checked) 7245 // 7246 // or as: 7247 // results = shufflevector v1, v2, shuffle_mask 7248 // where both results are returned in one vector and the shuffle mask has twice 7249 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 7250 // want to check the low half and high half of the shuffle mask as if it were 7251 // the other case 7252 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7253 unsigned EltSz = VT.getScalarSizeInBits(); 7254 if (EltSz == 64) 7255 return false; 7256 7257 unsigned NumElts = VT.getVectorNumElements(); 7258 if (M.size() != NumElts && M.size() != NumElts*2) 7259 return false; 7260 7261 // If the mask is twice as long as the input vector then we need to check the 7262 // upper and lower parts of the mask with a matching value for WhichResult 7263 // FIXME: A mask with only even values will be rejected in case the first 7264 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only 7265 // M[0] is used to determine WhichResult 7266 for (unsigned i = 0; i < M.size(); i += NumElts) { 7267 WhichResult = SelectPairHalf(NumElts, M, i); 7268 for (unsigned j = 0; j < NumElts; j += 2) { 7269 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 7270 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 7271 return false; 7272 } 7273 } 7274 7275 if (M.size() == NumElts*2) 7276 WhichResult = 0; 7277 7278 return true; 7279 } 7280 7281 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 7282 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7283 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 
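/// For v4i32 the two accepted forms are therefore <0, 0, 2, 2> (WhichResult ==
/// 0) and <1, 1, 3, 3> (WhichResult == 1), with undef entries ignored by the
/// per-element checks.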
7284 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7285 unsigned EltSz = VT.getScalarSizeInBits(); 7286 if (EltSz == 64) 7287 return false; 7288 7289 unsigned NumElts = VT.getVectorNumElements(); 7290 if (M.size() != NumElts && M.size() != NumElts*2) 7291 return false; 7292 7293 for (unsigned i = 0; i < M.size(); i += NumElts) { 7294 WhichResult = SelectPairHalf(NumElts, M, i); 7295 for (unsigned j = 0; j < NumElts; j += 2) { 7296 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 7297 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 7298 return false; 7299 } 7300 } 7301 7302 if (M.size() == NumElts*2) 7303 WhichResult = 0; 7304 7305 return true; 7306 } 7307 7308 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 7309 // that the mask elements are either all even and in steps of size 2 or all odd 7310 // and in steps of size 2. 7311 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 7312 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 7313 // v2={e,f,g,h} 7314 // Requires similar checks to that of isVTRNMask with 7315 // respect the how results are returned. 7316 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7317 unsigned EltSz = VT.getScalarSizeInBits(); 7318 if (EltSz == 64) 7319 return false; 7320 7321 unsigned NumElts = VT.getVectorNumElements(); 7322 if (M.size() != NumElts && M.size() != NumElts*2) 7323 return false; 7324 7325 for (unsigned i = 0; i < M.size(); i += NumElts) { 7326 WhichResult = SelectPairHalf(NumElts, M, i); 7327 for (unsigned j = 0; j < NumElts; ++j) { 7328 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 7329 return false; 7330 } 7331 } 7332 7333 if (M.size() == NumElts*2) 7334 WhichResult = 0; 7335 7336 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7337 if (VT.is64BitVector() && EltSz == 32) 7338 return false; 7339 7340 return true; 7341 } 7342 7343 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 7344 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7345 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 7346 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7347 unsigned EltSz = VT.getScalarSizeInBits(); 7348 if (EltSz == 64) 7349 return false; 7350 7351 unsigned NumElts = VT.getVectorNumElements(); 7352 if (M.size() != NumElts && M.size() != NumElts*2) 7353 return false; 7354 7355 unsigned Half = NumElts / 2; 7356 for (unsigned i = 0; i < M.size(); i += NumElts) { 7357 WhichResult = SelectPairHalf(NumElts, M, i); 7358 for (unsigned j = 0; j < NumElts; j += Half) { 7359 unsigned Idx = WhichResult; 7360 for (unsigned k = 0; k < Half; ++k) { 7361 int MIdx = M[i + j + k]; 7362 if (MIdx >= 0 && (unsigned) MIdx != Idx) 7363 return false; 7364 Idx += 2; 7365 } 7366 } 7367 } 7368 7369 if (M.size() == NumElts*2) 7370 WhichResult = 0; 7371 7372 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7373 if (VT.is64BitVector() && EltSz == 32) 7374 return false; 7375 7376 return true; 7377 } 7378 7379 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 7380 // that pairs of elements of the shufflemask represent the same index in each 7381 // vector incrementing sequentially through the vectors. 7382 // e.g. 
For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 7383 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 7384 // v2={e,f,g,h} 7385 // Requires similar checks to that of isVTRNMask with respect the how results 7386 // are returned. 7387 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7388 unsigned EltSz = VT.getScalarSizeInBits(); 7389 if (EltSz == 64) 7390 return false; 7391 7392 unsigned NumElts = VT.getVectorNumElements(); 7393 if (M.size() != NumElts && M.size() != NumElts*2) 7394 return false; 7395 7396 for (unsigned i = 0; i < M.size(); i += NumElts) { 7397 WhichResult = SelectPairHalf(NumElts, M, i); 7398 unsigned Idx = WhichResult * NumElts / 2; 7399 for (unsigned j = 0; j < NumElts; j += 2) { 7400 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 7401 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 7402 return false; 7403 Idx += 1; 7404 } 7405 } 7406 7407 if (M.size() == NumElts*2) 7408 WhichResult = 0; 7409 7410 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7411 if (VT.is64BitVector() && EltSz == 32) 7412 return false; 7413 7414 return true; 7415 } 7416 7417 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 7418 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7419 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 7420 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7421 unsigned EltSz = VT.getScalarSizeInBits(); 7422 if (EltSz == 64) 7423 return false; 7424 7425 unsigned NumElts = VT.getVectorNumElements(); 7426 if (M.size() != NumElts && M.size() != NumElts*2) 7427 return false; 7428 7429 for (unsigned i = 0; i < M.size(); i += NumElts) { 7430 WhichResult = SelectPairHalf(NumElts, M, i); 7431 unsigned Idx = WhichResult * NumElts / 2; 7432 for (unsigned j = 0; j < NumElts; j += 2) { 7433 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 7434 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 7435 return false; 7436 Idx += 1; 7437 } 7438 } 7439 7440 if (M.size() == NumElts*2) 7441 WhichResult = 0; 7442 7443 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7444 if (VT.is64BitVector() && EltSz == 32) 7445 return false; 7446 7447 return true; 7448 } 7449 7450 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 7451 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 7452 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 7453 unsigned &WhichResult, 7454 bool &isV_UNDEF) { 7455 isV_UNDEF = false; 7456 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 7457 return ARMISD::VTRN; 7458 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 7459 return ARMISD::VUZP; 7460 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 7461 return ARMISD::VZIP; 7462 7463 isV_UNDEF = true; 7464 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7465 return ARMISD::VTRN; 7466 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7467 return ARMISD::VUZP; 7468 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7469 return ARMISD::VZIP; 7470 7471 return 0; 7472 } 7473 7474 /// \return true if this is a reverse operation on an vector. 7475 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 7476 unsigned NumElts = VT.getVectorNumElements(); 7477 // Make sure the mask has the right size. 7478 if (NumElts != M.size()) 7479 return false; 7480 7481 // Look for <15, ..., 3, -1, 1, 0>. 
7482 for (unsigned i = 0; i != NumElts; ++i) 7483 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 7484 return false; 7485 7486 return true; 7487 } 7488 7489 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) { 7490 unsigned NumElts = VT.getVectorNumElements(); 7491 // Make sure the mask has the right size. 7492 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) 7493 return false; 7494 7495 // If Top 7496 // Look for <0, N, 2, N+2, 4, N+4, ..>. 7497 // This inserts Input2 into Input1 7498 // else if not Top 7499 // Look for <0, N+1, 2, N+3, 4, N+5, ..> 7500 // This inserts Input1 into Input2 7501 unsigned Offset = Top ? 0 : 1; 7502 unsigned N = SingleSource ? 0 : NumElts; 7503 for (unsigned i = 0; i < NumElts; i += 2) { 7504 if (M[i] >= 0 && M[i] != (int)i) 7505 return false; 7506 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset)) 7507 return false; 7508 } 7509 7510 return true; 7511 } 7512 7513 static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) { 7514 unsigned NumElts = ToVT.getVectorNumElements(); 7515 if (NumElts != M.size()) 7516 return false; 7517 7518 // Test if the Trunc can be convertable to a VMOVN with this shuffle. We are 7519 // looking for patterns of: 7520 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ... 7521 // rev: N/2 0 N/2+1 1 N/2+2 2 ... 7522 7523 unsigned Off0 = rev ? NumElts / 2 : 0; 7524 unsigned Off1 = rev ? 0 : NumElts / 2; 7525 for (unsigned i = 0; i < NumElts; i += 2) { 7526 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2)) 7527 return false; 7528 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2)) 7529 return false; 7530 } 7531 7532 return true; 7533 } 7534 7535 // Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted 7536 // from a pair of inputs. For example: 7537 // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), 7538 // FP_ROUND(EXTRACT_ELT(Y, 0), 7539 // FP_ROUND(EXTRACT_ELT(X, 1), 7540 // FP_ROUND(EXTRACT_ELT(Y, 1), ...) 7541 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, 7542 const ARMSubtarget *ST) { 7543 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7544 if (!ST->hasMVEFloatOps()) 7545 return SDValue(); 7546 7547 SDLoc dl(BV); 7548 EVT VT = BV.getValueType(); 7549 if (VT != MVT::v8f16) 7550 return SDValue(); 7551 7552 // We are looking for a buildvector of fptrunc elements, where all the 7553 // elements are interleavingly extracted from two sources. Check the first two 7554 // items are valid enough and extract some info from them (they are checked 7555 // properly in the loop below). 7556 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND || 7557 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 7558 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0) 7559 return SDValue(); 7560 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND || 7561 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 7562 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0) 7563 return SDValue(); 7564 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); 7565 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0); 7566 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32) 7567 return SDValue(); 7568 7569 // Check all the values in the BuildVector line up with our expectations. 
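// That is, operand 2*i must be FP_ROUND(EXTRACT_ELT(Op0, i)) and operand
// 2*i+1 must be FP_ROUND(EXTRACT_ELT(Op1, i)), matching the interleaved
// pattern shown above.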
7570 for (unsigned i = 1; i < 4; i++) { 7571 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { 7572 return Trunc.getOpcode() == ISD::FP_ROUND && 7573 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7574 Trunc.getOperand(0).getOperand(0) == Op && 7575 Trunc.getOperand(0).getConstantOperandVal(1) == Idx; 7576 }; 7577 if (!Check(BV.getOperand(i * 2 + 0), Op0, i)) 7578 return SDValue(); 7579 if (!Check(BV.getOperand(i * 2 + 1), Op1, i)) 7580 return SDValue(); 7581 } 7582 7583 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0, 7584 DAG.getConstant(0, dl, MVT::i32)); 7585 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1, 7586 DAG.getConstant(1, dl, MVT::i32)); 7587 } 7588 7589 // Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted 7590 // from a single input on alternating lanes. For example: 7591 // BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0), 7592 // FP_EXTEND(EXTRACT_ELT(X, 2), 7593 // FP_EXTEND(EXTRACT_ELT(X, 4), ...) 7594 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, 7595 const ARMSubtarget *ST) { 7596 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7597 if (!ST->hasMVEFloatOps()) 7598 return SDValue(); 7599 7600 SDLoc dl(BV); 7601 EVT VT = BV.getValueType(); 7602 if (VT != MVT::v4f32) 7603 return SDValue(); 7604 7605 // We are looking for a buildvector of fpext elements, where all the 7606 // elements are alternating lanes from a single source. For example <0,2,4,6> 7607 // or <1,3,5,7>. Check the first item is valid enough and extract some 7608 // info from it (all the operands are checked properly in the loop below). 7609 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND || 7610 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT) 7611 return SDValue(); 7612 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); 7613 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1); 7614 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1)) 7615 return SDValue(); 7616 7617 // Check all the values in the BuildVector line up with our expectations. 7618 for (unsigned i = 1; i < 4; i++) { 7619 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { 7620 return Trunc.getOpcode() == ISD::FP_EXTEND && 7621 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7622 Trunc.getOperand(0).getOperand(0) == Op && 7623 Trunc.getOperand(0).getConstantOperandVal(1) == Idx; 7624 }; 7625 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset)) 7626 return SDValue(); 7627 } 7628 7629 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0, 7630 DAG.getConstant(Offset, dl, MVT::i32)); 7631 } 7632 7633 // If N is an integer constant that can be moved into a register in one 7634 // instruction, return an SDValue of such a constant (will become a MOV 7635 // instruction). Otherwise return null.
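// On Thumb1 this means Val (or its bitwise complement) fits in 8 bits;
// otherwise it means Val or ~Val has a valid ARM modified-immediate encoding,
// i.e. the constant can be formed with a single MOV or MVN.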
7636 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 7637 const ARMSubtarget *ST, const SDLoc &dl) { 7638 uint64_t Val; 7639 if (!isa<ConstantSDNode>(N)) 7640 return SDValue(); 7641 Val = cast<ConstantSDNode>(N)->getZExtValue(); 7642 7643 if (ST->isThumb1Only()) { 7644 if (Val <= 255 || ~Val <= 255) 7645 return DAG.getConstant(Val, dl, MVT::i32); 7646 } else { 7647 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 7648 return DAG.getConstant(Val, dl, MVT::i32); 7649 } 7650 return SDValue(); 7651 } 7652 7653 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, 7654 const ARMSubtarget *ST) { 7655 SDLoc dl(Op); 7656 EVT VT = Op.getValueType(); 7657 7658 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); 7659 7660 unsigned NumElts = VT.getVectorNumElements(); 7661 unsigned BoolMask; 7662 unsigned BitsPerBool; 7663 if (NumElts == 2) { 7664 BitsPerBool = 8; 7665 BoolMask = 0xff; 7666 } else if (NumElts == 4) { 7667 BitsPerBool = 4; 7668 BoolMask = 0xf; 7669 } else if (NumElts == 8) { 7670 BitsPerBool = 2; 7671 BoolMask = 0x3; 7672 } else if (NumElts == 16) { 7673 BitsPerBool = 1; 7674 BoolMask = 0x1; 7675 } else 7676 return SDValue(); 7677 7678 // If this is a single value copied into all lanes (a splat), we can just sign 7679 // extend that single value 7680 SDValue FirstOp = Op.getOperand(0); 7681 if (!isa<ConstantSDNode>(FirstOp) && 7682 std::all_of(std::next(Op->op_begin()), Op->op_end(), 7683 [&FirstOp](SDUse &U) { 7684 return U.get().isUndef() || U.get() == FirstOp; 7685 })) { 7686 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, 7687 DAG.getValueType(MVT::i1)); 7688 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); 7689 } 7690 7691 // First create base with bits set where known 7692 unsigned Bits32 = 0; 7693 for (unsigned i = 0; i < NumElts; ++i) { 7694 SDValue V = Op.getOperand(i); 7695 if (!isa<ConstantSDNode>(V) && !V.isUndef()) 7696 continue; 7697 bool BitSet = V.isUndef() ? 
false : cast<ConstantSDNode>(V)->getZExtValue(); 7698 if (BitSet) 7699 Bits32 |= BoolMask << (i * BitsPerBool); 7700 } 7701 7702 // Add in unknown nodes 7703 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 7704 DAG.getConstant(Bits32, dl, MVT::i32)); 7705 for (unsigned i = 0; i < NumElts; ++i) { 7706 SDValue V = Op.getOperand(i); 7707 if (isa<ConstantSDNode>(V) || V.isUndef()) 7708 continue; 7709 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, 7710 DAG.getConstant(i, dl, MVT::i32)); 7711 } 7712 7713 return Base; 7714 } 7715 7716 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, 7717 const ARMSubtarget *ST) { 7718 if (!ST->hasMVEIntegerOps()) 7719 return SDValue(); 7720 7721 // We are looking for a buildvector where each element is Op[0] + i*N 7722 EVT VT = Op.getValueType(); 7723 SDValue Op0 = Op.getOperand(0); 7724 unsigned NumElts = VT.getVectorNumElements(); 7725 7726 // Get the increment value from operand 1 7727 SDValue Op1 = Op.getOperand(1); 7728 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 || 7729 !isa<ConstantSDNode>(Op1.getOperand(1))) 7730 return SDValue(); 7731 unsigned N = Op1.getConstantOperandVal(1); 7732 if (N != 1 && N != 2 && N != 4 && N != 8) 7733 return SDValue(); 7734 7735 // Check that each other operand matches 7736 for (unsigned I = 2; I < NumElts; I++) { 7737 SDValue OpI = Op.getOperand(I); 7738 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 || 7739 !isa<ConstantSDNode>(OpI.getOperand(1)) || 7740 OpI.getConstantOperandVal(1) != I * N) 7741 return SDValue(); 7742 } 7743 7744 SDLoc DL(Op); 7745 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0, 7746 DAG.getConstant(N, DL, MVT::i32)); 7747 } 7748 7749 // Returns true if the operation N can be treated as qr instruction variant at 7750 // operand Op. 7751 static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) { 7752 switch (N->getOpcode()) { 7753 case ISD::ADD: 7754 case ISD::MUL: 7755 case ISD::SADDSAT: 7756 case ISD::UADDSAT: 7757 return true; 7758 case ISD::SUB: 7759 case ISD::SSUBSAT: 7760 case ISD::USUBSAT: 7761 return N->getOperand(1).getNode() == Op; 7762 case ISD::INTRINSIC_WO_CHAIN: 7763 switch (N->getConstantOperandVal(0)) { 7764 case Intrinsic::arm_mve_add_predicated: 7765 case Intrinsic::arm_mve_mul_predicated: 7766 case Intrinsic::arm_mve_qadd_predicated: 7767 case Intrinsic::arm_mve_vhadd: 7768 case Intrinsic::arm_mve_hadd_predicated: 7769 case Intrinsic::arm_mve_vqdmulh: 7770 case Intrinsic::arm_mve_qdmulh_predicated: 7771 case Intrinsic::arm_mve_vqrdmulh: 7772 case Intrinsic::arm_mve_qrdmulh_predicated: 7773 case Intrinsic::arm_mve_vqdmull: 7774 case Intrinsic::arm_mve_vqdmull_predicated: 7775 return true; 7776 case Intrinsic::arm_mve_sub_predicated: 7777 case Intrinsic::arm_mve_qsub_predicated: 7778 case Intrinsic::arm_mve_vhsub: 7779 case Intrinsic::arm_mve_hsub_predicated: 7780 return N->getOperand(2).getNode() == Op; 7781 default: 7782 return false; 7783 } 7784 default: 7785 return false; 7786 } 7787 } 7788 7789 // If this is a case we can't handle, return null and let the default 7790 // expansion code take care of it. 
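// The strategies below are tried in order: i1 predicate vectors (MVE), VIDUP
// sequences, immediate VMOV/VMVN splats, VDUP/VDUPLANE of a dominant value,
// reconstruction as a shuffle or a VCVT, splitting 128-bit vectors in half,
// ARMISD::BUILD_VECTOR for 32- and 64-bit elements, and finally a plain chain
// of INSERT_VECTOR_ELTs.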
7791 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 7792 const ARMSubtarget *ST) const { 7793 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 7794 SDLoc dl(Op); 7795 EVT VT = Op.getValueType(); 7796 7797 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 7798 return LowerBUILD_VECTOR_i1(Op, DAG, ST); 7799 7800 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST)) 7801 return R; 7802 7803 APInt SplatBits, SplatUndef; 7804 unsigned SplatBitSize; 7805 bool HasAnyUndefs; 7806 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 7807 if (SplatUndef.isAllOnes()) 7808 return DAG.getUNDEF(VT); 7809 7810 // If all the users of this constant splat are qr instruction variants, 7811 // generate a vdup of the constant. 7812 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize && 7813 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) && 7814 all_of(BVN->uses(), 7815 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) { 7816 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 7817 : SplatBitSize == 16 ? MVT::v8i16 7818 : MVT::v16i8; 7819 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); 7820 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); 7821 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); 7822 } 7823 7824 if ((ST->hasNEON() && SplatBitSize <= 64) || 7825 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { 7826 // Check if an immediate VMOV works. 7827 EVT VmovVT; 7828 SDValue Val = 7829 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), 7830 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm); 7831 7832 if (Val.getNode()) { 7833 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 7834 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7835 } 7836 7837 // Try an immediate VMVN. 7838 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 7839 Val = isVMOVModifiedImm( 7840 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, 7841 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); 7842 if (Val.getNode()) { 7843 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 7844 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7845 } 7846 7847 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 7848 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 7849 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 7850 if (ImmVal != -1) { 7851 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 7852 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 7853 } 7854 } 7855 7856 // If we are under MVE, generate a VDUP(constant), bitcast to the original 7857 // type. 7858 if (ST->hasMVEIntegerOps() && 7859 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) { 7860 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 7861 : SplatBitSize == 16 ? MVT::v8i16 7862 : MVT::v16i8; 7863 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); 7864 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); 7865 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); 7866 } 7867 } 7868 } 7869 7870 // Scan through the operands to see if only one value is used. 7871 // 7872 // As an optimisation, even if more than one value is used it may be more 7873 // profitable to splat with one value then change some lanes. 7874 // 7875 // Heuristically we decide to do this if the vector has a "dominant" value, 7876 // defined as splatted to more than half of the lanes. 
7877 unsigned NumElts = VT.getVectorNumElements(); 7878 bool isOnlyLowElement = true; 7879 bool usesOnlyOneValue = true; 7880 bool hasDominantValue = false; 7881 bool isConstant = true; 7882 7883 // Map of the number of times a particular SDValue appears in the 7884 // element list. 7885 DenseMap<SDValue, unsigned> ValueCounts; 7886 SDValue Value; 7887 for (unsigned i = 0; i < NumElts; ++i) { 7888 SDValue V = Op.getOperand(i); 7889 if (V.isUndef()) 7890 continue; 7891 if (i > 0) 7892 isOnlyLowElement = false; 7893 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 7894 isConstant = false; 7895 7896 ValueCounts.insert(std::make_pair(V, 0)); 7897 unsigned &Count = ValueCounts[V]; 7898 7899 // Is this value dominant? (takes up more than half of the lanes) 7900 if (++Count > (NumElts / 2)) { 7901 hasDominantValue = true; 7902 Value = V; 7903 } 7904 } 7905 if (ValueCounts.size() != 1) 7906 usesOnlyOneValue = false; 7907 if (!Value.getNode() && !ValueCounts.empty()) 7908 Value = ValueCounts.begin()->first; 7909 7910 if (ValueCounts.empty()) 7911 return DAG.getUNDEF(VT); 7912 7913 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 7914 // Keep going if we are hitting this case. 7915 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 7916 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 7917 7918 unsigned EltSize = VT.getScalarSizeInBits(); 7919 7920 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 7921 // i32 and try again. 7922 if (hasDominantValue && EltSize <= 32) { 7923 if (!isConstant) { 7924 SDValue N; 7925 7926 // If we are VDUPing a value that comes directly from a vector, that will 7927 // cause an unnecessary move to and from a GPR, where instead we could 7928 // just use VDUPLANE. We can only do this if the lane being extracted 7929 // is at a constant index, as the VDUP from lane instructions only have 7930 // constant-index forms. 7931 ConstantSDNode *constIndex; 7932 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7933 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 7934 // We need to create a new undef vector to use for the VDUPLANE if the 7935 // size of the vector from which we get the value is different than the 7936 // size of the vector that we need to create. We will insert the element 7937 // such that the register coalescer will remove unnecessary copies. 7938 if (VT != Value->getOperand(0).getValueType()) { 7939 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 7940 VT.getVectorNumElements(); 7941 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7942 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 7943 Value, DAG.getConstant(index, dl, MVT::i32)), 7944 DAG.getConstant(index, dl, MVT::i32)); 7945 } else 7946 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7947 Value->getOperand(0), Value->getOperand(1)); 7948 } else 7949 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 7950 7951 if (!usesOnlyOneValue) { 7952 // The dominant value was splatted as 'N', but we now have to insert 7953 // all differing elements. 
7954 for (unsigned I = 0; I < NumElts; ++I) { 7955 if (Op.getOperand(I) == Value) 7956 continue; 7957 SmallVector<SDValue, 3> Ops; 7958 Ops.push_back(N); 7959 Ops.push_back(Op.getOperand(I)); 7960 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 7961 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 7962 } 7963 } 7964 return N; 7965 } 7966 if (VT.getVectorElementType().isFloatingPoint()) { 7967 SmallVector<SDValue, 8> Ops; 7968 MVT FVT = VT.getVectorElementType().getSimpleVT(); 7969 assert(FVT == MVT::f32 || FVT == MVT::f16); 7970 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16; 7971 for (unsigned i = 0; i < NumElts; ++i) 7972 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, 7973 Op.getOperand(i))); 7974 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); 7975 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 7976 Val = LowerBUILD_VECTOR(Val, DAG, ST); 7977 if (Val.getNode()) 7978 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7979 } 7980 if (usesOnlyOneValue) { 7981 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 7982 if (isConstant && Val.getNode()) 7983 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 7984 } 7985 } 7986 7987 // If all elements are constants and the case above didn't get hit, fall back 7988 // to the default expansion, which will generate a load from the constant 7989 // pool. 7990 if (isConstant) 7991 return SDValue(); 7992 7993 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and 7994 // vmovn). Empirical tests suggest this is rarely worth it for vectors of 7995 // length <= 2. 7996 if (NumElts >= 4) 7997 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 7998 return shuffle; 7999 8000 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into 8001 // VCVT's 8002 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget)) 8003 return VCVT; 8004 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget)) 8005 return VCVT; 8006 8007 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 8008 // If we haven't found an efficient lowering, try splitting a 128-bit vector 8009 // into two 64-bit vectors; we might discover a better way to lower it. 8010 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 8011 EVT ExtVT = VT.getVectorElementType(); 8012 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 8013 SDValue Lower = 8014 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 8015 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 8016 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 8017 SDValue Upper = DAG.getBuildVector( 8018 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 8019 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 8020 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 8021 if (Lower && Upper) 8022 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 8023 } 8024 8025 // Vectors with 32- or 64-bit elements can be built by directly assigning 8026 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 8027 // will be legalized. 8028 if (EltSize >= 32) { 8029 // Do the expansion with floating-point types, since that is what the VFP 8030 // registers are defined to use, and since i64 is not legal. 
8031 EVT EltVT = EVT::getFloatingPointVT(EltSize); 8032 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 8033 SmallVector<SDValue, 8> Ops; 8034 for (unsigned i = 0; i < NumElts; ++i) 8035 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 8036 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 8037 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 8038 } 8039 8040 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 8041 // know the default expansion would otherwise fall back on something even 8042 // worse. For a vector with one or two non-undef values, that's 8043 // scalar_to_vector for the elements followed by a shuffle (provided the 8044 // shuffle is valid for the target) and materialization element by element 8045 // on the stack followed by a load for everything else. 8046 if (!isConstant && !usesOnlyOneValue) { 8047 SDValue Vec = DAG.getUNDEF(VT); 8048 for (unsigned i = 0 ; i < NumElts; ++i) { 8049 SDValue V = Op.getOperand(i); 8050 if (V.isUndef()) 8051 continue; 8052 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 8053 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 8054 } 8055 return Vec; 8056 } 8057 8058 return SDValue(); 8059 } 8060 8061 // Gather data to see if the operation can be modelled as a 8062 // shuffle in combination with VEXTs. 8063 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 8064 SelectionDAG &DAG) const { 8065 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 8066 SDLoc dl(Op); 8067 EVT VT = Op.getValueType(); 8068 unsigned NumElts = VT.getVectorNumElements(); 8069 8070 struct ShuffleSourceInfo { 8071 SDValue Vec; 8072 unsigned MinElt = std::numeric_limits<unsigned>::max(); 8073 unsigned MaxElt = 0; 8074 8075 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 8076 // be compatible with the shuffle we intend to construct. As a result 8077 // ShuffleVec will be some sliding window into the original Vec. 8078 SDValue ShuffleVec; 8079 8080 // Code should guarantee that element i in Vec starts at element "WindowBase 8081 // + i * WindowScale in ShuffleVec". 8082 int WindowBase = 0; 8083 int WindowScale = 1; 8084 8085 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} 8086 8087 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 8088 }; 8089 8090 // First gather all vectors used as an immediate source for this BUILD_VECTOR 8091 // node. 8092 SmallVector<ShuffleSourceInfo, 2> Sources; 8093 for (unsigned i = 0; i < NumElts; ++i) { 8094 SDValue V = Op.getOperand(i); 8095 if (V.isUndef()) 8096 continue; 8097 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 8098 // A shuffle can only come from building a vector from various 8099 // elements of other vectors. 8100 return SDValue(); 8101 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 8102 // Furthermore, shuffles require a constant mask, whereas extractelts 8103 // accept variable indices. 8104 return SDValue(); 8105 } 8106 8107 // Add this element source to the list if it's not already there. 8108 SDValue SourceVec = V.getOperand(0); 8109 auto Source = llvm::find(Sources, SourceVec); 8110 if (Source == Sources.end()) 8111 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 8112 8113 // Update the minimum and maximum lane number seen. 
8114 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 8115 Source->MinElt = std::min(Source->MinElt, EltNo); 8116 Source->MaxElt = std::max(Source->MaxElt, EltNo); 8117 } 8118 8119 // Currently only do something sane when at most two source vectors 8120 // are involved. 8121 if (Sources.size() > 2) 8122 return SDValue(); 8123 8124 // Find out the smallest element size among result and two sources, and use 8125 // it as element size to build the shuffle_vector. 8126 EVT SmallestEltTy = VT.getVectorElementType(); 8127 for (auto &Source : Sources) { 8128 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 8129 if (SrcEltTy.bitsLT(SmallestEltTy)) 8130 SmallestEltTy = SrcEltTy; 8131 } 8132 unsigned ResMultiplier = 8133 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 8134 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 8135 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 8136 8137 // If the source vector is too wide or too narrow, we may nevertheless be able 8138 // to construct a compatible shuffle either by concatenating it with UNDEF or 8139 // extracting a suitable range of elements. 8140 for (auto &Src : Sources) { 8141 EVT SrcVT = Src.ShuffleVec.getValueType(); 8142 8143 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); 8144 uint64_t VTSize = VT.getFixedSizeInBits(); 8145 if (SrcVTSize == VTSize) 8146 continue; 8147 8148 // This stage of the search produces a source with the same element type as 8149 // the original, but with a total width matching the BUILD_VECTOR output. 8150 EVT EltVT = SrcVT.getVectorElementType(); 8151 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); 8152 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 8153 8154 if (SrcVTSize < VTSize) { 8155 if (2 * SrcVTSize != VTSize) 8156 return SDValue(); 8157 // We can pad out the smaller vector for free, so if it's part of a 8158 // shuffle... 8159 Src.ShuffleVec = 8160 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 8161 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 8162 continue; 8163 } 8164 8165 if (SrcVTSize != 2 * VTSize) 8166 return SDValue(); 8167 8168 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 8169 // Span too large for a VEXT to cope 8170 return SDValue(); 8171 } 8172 8173 if (Src.MinElt >= NumSrcElts) { 8174 // The extraction can just take the second half 8175 Src.ShuffleVec = 8176 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8177 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 8178 Src.WindowBase = -NumSrcElts; 8179 } else if (Src.MaxElt < NumSrcElts) { 8180 // The extraction can just take the first half 8181 Src.ShuffleVec = 8182 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8183 DAG.getConstant(0, dl, MVT::i32)); 8184 } else { 8185 // An actual VEXT is needed 8186 SDValue VEXTSrc1 = 8187 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8188 DAG.getConstant(0, dl, MVT::i32)); 8189 SDValue VEXTSrc2 = 8190 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 8191 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 8192 8193 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 8194 VEXTSrc2, 8195 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 8196 Src.WindowBase = -Src.MinElt; 8197 } 8198 } 8199 8200 // Another possible incompatibility occurs from the vector element types. We 8201 // can fix this by bitcasting the source vectors to the same type we intend 8202 // for the shuffle. 
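// Each source whose elements are wider than SmallestEltTy is reinterpreted
// with a VECTOR_REG_CAST; WindowScale records how many shuffle lanes each
// original element then spans (e.g. 2 when an i16 source feeds an i8 shuffle).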
8203 for (auto &Src : Sources) { 8204 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 8205 if (SrcEltTy == SmallestEltTy) 8206 continue; 8207 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 8208 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec); 8209 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 8210 Src.WindowBase *= Src.WindowScale; 8211 } 8212 8213 // Final check before we try to actually produce a shuffle. 8214 LLVM_DEBUG(for (auto Src 8215 : Sources) 8216 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 8217 8218 // The stars all align, our next step is to produce the mask for the shuffle. 8219 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 8220 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 8221 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 8222 SDValue Entry = Op.getOperand(i); 8223 if (Entry.isUndef()) 8224 continue; 8225 8226 auto Src = llvm::find(Sources, Entry.getOperand(0)); 8227 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 8228 8229 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 8230 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 8231 // segment. 8232 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 8233 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), 8234 VT.getScalarSizeInBits()); 8235 int LanesDefined = BitsDefined / BitsPerShuffleLane; 8236 8237 // This source is expected to fill ResMultiplier lanes of the final shuffle, 8238 // starting at the appropriate offset. 8239 int *LaneMask = &Mask[i * ResMultiplier]; 8240 8241 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 8242 ExtractBase += NumElts * (Src - Sources.begin()); 8243 for (int j = 0; j < LanesDefined; ++j) 8244 LaneMask[j] = ExtractBase + j; 8245 } 8246 8247 8248 // We can't handle more than two sources. This should have already 8249 // been checked before this point. 8250 assert(Sources.size() <= 2 && "Too many sources!"); 8251 8252 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 8253 for (unsigned i = 0; i < Sources.size(); ++i) 8254 ShuffleOps[i] = Sources[i].ShuffleVec; 8255 8256 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 8257 ShuffleOps[1], Mask, DAG); 8258 if (!Shuffle) 8259 return SDValue(); 8260 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle); 8261 } 8262 8263 enum ShuffleOpCodes { 8264 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 8265 OP_VREV, 8266 OP_VDUP0, 8267 OP_VDUP1, 8268 OP_VDUP2, 8269 OP_VDUP3, 8270 OP_VEXT1, 8271 OP_VEXT2, 8272 OP_VEXT3, 8273 OP_VUZPL, // VUZP, left result 8274 OP_VUZPR, // VUZP, right result 8275 OP_VZIPL, // VZIP, left result 8276 OP_VZIPR, // VZIP, right result 8277 OP_VTRNL, // VTRN, left result 8278 OP_VTRNR // VTRN, right result 8279 }; 8280 8281 static bool isLegalMVEShuffleOp(unsigned PFEntry) { 8282 unsigned OpNum = (PFEntry >> 26) & 0x0F; 8283 switch (OpNum) { 8284 case OP_COPY: 8285 case OP_VREV: 8286 case OP_VDUP0: 8287 case OP_VDUP1: 8288 case OP_VDUP2: 8289 case OP_VDUP3: 8290 return true; 8291 } 8292 return false; 8293 } 8294 8295 /// isShuffleMaskLegal - Targets can use this to indicate that they only 8296 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 
8297 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 8298 /// are assumed to be legal. 8299 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 8300 if (VT.getVectorNumElements() == 4 && 8301 (VT.is128BitVector() || VT.is64BitVector())) { 8302 unsigned PFIndexes[4]; 8303 for (unsigned i = 0; i != 4; ++i) { 8304 if (M[i] < 0) 8305 PFIndexes[i] = 8; 8306 else 8307 PFIndexes[i] = M[i]; 8308 } 8309 8310 // Compute the index in the perfect shuffle table. 8311 unsigned PFTableIndex = 8312 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8313 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8314 unsigned Cost = (PFEntry >> 30); 8315 8316 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) 8317 return true; 8318 } 8319 8320 bool ReverseVEXT, isV_UNDEF; 8321 unsigned Imm, WhichResult; 8322 8323 unsigned EltSize = VT.getScalarSizeInBits(); 8324 if (EltSize >= 32 || 8325 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 8326 ShuffleVectorInst::isIdentityMask(M) || 8327 isVREVMask(M, VT, 64) || 8328 isVREVMask(M, VT, 32) || 8329 isVREVMask(M, VT, 16)) 8330 return true; 8331 else if (Subtarget->hasNEON() && 8332 (isVEXTMask(M, VT, ReverseVEXT, Imm) || 8333 isVTBLMask(M, VT) || 8334 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) 8335 return true; 8336 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 8337 isReverseMask(M, VT)) 8338 return true; 8339 else if (Subtarget->hasMVEIntegerOps() && 8340 (isVMOVNMask(M, VT, true, false) || 8341 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true))) 8342 return true; 8343 else 8344 return false; 8345 } 8346 8347 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 8348 /// the specified operations to build the shuffle. 8349 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 8350 SDValue RHS, SelectionDAG &DAG, 8351 const SDLoc &dl) { 8352 unsigned OpNum = (PFEntry >> 26) & 0x0F; 8353 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8354 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8355 8356 if (OpNum == OP_COPY) { 8357 if (LHSID == (1*9+2)*9+3) return LHS; 8358 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 8359 return RHS; 8360 } 8361 8362 SDValue OpLHS, OpRHS; 8363 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 8364 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 8365 EVT VT = OpLHS.getValueType(); 8366 8367 switch (OpNum) { 8368 default: llvm_unreachable("Unknown shuffle opcode!"); 8369 case OP_VREV: 8370 // VREV divides the vector in half and swaps within the half. 
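// For the 4-lane vectors handled here this swaps adjacent pairs of lanes, e.g.
// <0, 1, 2, 3> becomes <1, 0, 3, 2>, using whichever VREV variant matches the
// element size.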
8371 if (VT.getVectorElementType() == MVT::i32 || 8372 VT.getVectorElementType() == MVT::f32) 8373 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 8374 // vrev <4 x i16> -> VREV32 8375 if (VT.getVectorElementType() == MVT::i16 || 8376 VT.getVectorElementType() == MVT::f16) 8377 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 8378 // vrev <4 x i8> -> VREV16 8379 assert(VT.getVectorElementType() == MVT::i8); 8380 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 8381 case OP_VDUP0: 8382 case OP_VDUP1: 8383 case OP_VDUP2: 8384 case OP_VDUP3: 8385 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 8386 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 8387 case OP_VEXT1: 8388 case OP_VEXT2: 8389 case OP_VEXT3: 8390 return DAG.getNode(ARMISD::VEXT, dl, VT, 8391 OpLHS, OpRHS, 8392 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 8393 case OP_VUZPL: 8394 case OP_VUZPR: 8395 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 8396 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 8397 case OP_VZIPL: 8398 case OP_VZIPR: 8399 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 8400 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 8401 case OP_VTRNL: 8402 case OP_VTRNR: 8403 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 8404 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 8405 } 8406 } 8407 8408 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 8409 ArrayRef<int> ShuffleMask, 8410 SelectionDAG &DAG) { 8411 // Check to see if we can use the VTBL instruction. 8412 SDValue V1 = Op.getOperand(0); 8413 SDValue V2 = Op.getOperand(1); 8414 SDLoc DL(Op); 8415 8416 SmallVector<SDValue, 8> VTBLMask; 8417 for (int I : ShuffleMask) 8418 VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32)); 8419 8420 if (V2.getNode()->isUndef()) 8421 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 8422 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 8423 8424 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 8425 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 8426 } 8427 8428 static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 8429 SDLoc DL(Op); 8430 EVT VT = Op.getValueType(); 8431 8432 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 8433 "Expect an v8i16/v16i8 type"); 8434 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0)); 8435 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now, 8436 // extract the first 8 bytes into the top double word and the last 8 bytes 8437 // into the bottom double word, through a new vector shuffle that will be 8438 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE. 
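// e.g. for v16i8 the second shuffle uses the mask <8..15, 0..7>, swapping the
// two doublewords of the VREV64 result to complete the full reversal.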
8439 std::vector<int> NewMask; 8440 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) 8441 NewMask.push_back(VT.getVectorNumElements() / 2 + i); 8442 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) 8443 NewMask.push_back(i); 8444 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask); 8445 } 8446 8447 static EVT getVectorTyFromPredicateVector(EVT VT) { 8448 switch (VT.getSimpleVT().SimpleTy) { 8449 case MVT::v2i1: 8450 return MVT::v2f64; 8451 case MVT::v4i1: 8452 return MVT::v4i32; 8453 case MVT::v8i1: 8454 return MVT::v8i16; 8455 case MVT::v16i1: 8456 return MVT::v16i8; 8457 default: 8458 llvm_unreachable("Unexpected vector predicate type"); 8459 } 8460 } 8461 8462 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, 8463 SelectionDAG &DAG) { 8464 // Converting from boolean predicates to integers involves creating a vector 8465 // of all ones or all zeroes and selecting the lanes based upon the real 8466 // predicate. 8467 SDValue AllOnes = 8468 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); 8469 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); 8470 8471 SDValue AllZeroes = 8472 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); 8473 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); 8474 8475 // Get full vector type from predicate type 8476 EVT NewVT = getVectorTyFromPredicateVector(VT); 8477 8478 SDValue RecastV1; 8479 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast 8480 // this to a v16i1. This cannot be done with an ordinary bitcast because the 8481 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, 8482 // since we know in hardware the sizes are really the same. 8483 if (VT != MVT::v16i1) 8484 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); 8485 else 8486 RecastV1 = Pred; 8487 8488 // Select either all ones or zeroes depending upon the real predicate bits. 8489 SDValue PredAsVector = 8490 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); 8491 8492 // Recast our new predicate-as-integer v16i8 vector into something 8493 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate. 8494 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); 8495 } 8496 8497 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, 8498 const ARMSubtarget *ST) { 8499 EVT VT = Op.getValueType(); 8500 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 8501 ArrayRef<int> ShuffleMask = SVN->getMask(); 8502 8503 assert(ST->hasMVEIntegerOps() && 8504 "No support for vector shuffle of boolean predicates"); 8505 8506 SDValue V1 = Op.getOperand(0); 8507 SDLoc dl(Op); 8508 if (isReverseMask(ShuffleMask, VT)) { 8509 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); 8510 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); 8511 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, 8512 DAG.getConstant(16, dl, MVT::i32)); 8513 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); 8514 } 8515 8516 // Until we can come up with optimised cases for every single vector 8517 // shuffle in existence we have chosen the least painful strategy. This is 8518 // to essentially promote the boolean predicate to a 8-bit integer, where 8519 // each predicate represents a byte. Then we fall back on a normal integer 8520 // vector shuffle and convert the result back into a predicate vector. 
In 8521 // many cases the generated code might be even better than scalar code 8522 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit 8523 // fields in a register into 8 other arbitrary 2-bit fields! 8524 SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); 8525 EVT NewVT = PredAsVector.getValueType(); 8526 8527 // Do the shuffle! 8528 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, 8529 DAG.getUNDEF(NewVT), ShuffleMask); 8530 8531 // Now return the result of comparing the shuffled vector with zero, 8532 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 8533 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s. 8534 if (VT == MVT::v2i1) { 8535 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled); 8536 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, 8537 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8538 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); 8539 } 8540 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, 8541 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8542 } 8543 8544 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, 8545 ArrayRef<int> ShuffleMask, 8546 SelectionDAG &DAG) { 8547 // Attempt to lower the vector shuffle using as many whole register movs as 8548 // possible. This is useful for types smaller than 32 bits, which would 8549 // otherwise often become a series of GPR movs. 8550 SDLoc dl(Op); 8551 EVT VT = Op.getValueType(); 8552 if (VT.getScalarSizeInBits() >= 32) 8553 return SDValue(); 8554 8555 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 8556 "Unexpected vector type"); 8557 int NumElts = VT.getVectorNumElements(); 8558 int QuarterSize = NumElts / 4; 8559 // The four final parts of the vector, as i32s 8560 SDValue Parts[4]; 8561 8562 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc. (but not 8563 // <u,u,u,u>), returning the vmov lane index. 8564 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) { 8565 // Detect which mov lane this would be from the first non-undef element.
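// e.g. for a v8i16 shuffle each part covers two lanes, so mask values <4, 5>
// in a part correspond to 32-bit lane 2 of the first input (4 / 2 == 2).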
8566 int MovIdx = -1; 8567 for (int i = 0; i < Length; i++) { 8568 if (ShuffleMask[Start + i] >= 0) { 8569 if (ShuffleMask[Start + i] % Length != i) 8570 return -1; 8571 MovIdx = ShuffleMask[Start + i] / Length; 8572 break; 8573 } 8574 } 8575 // If all items are undef, leave this for other combines 8576 if (MovIdx == -1) 8577 return -1; 8578 // Check the remaining values are the correct part of the same mov 8579 for (int i = 1; i < Length; i++) { 8580 if (ShuffleMask[Start + i] >= 0 && 8581 (ShuffleMask[Start + i] / Length != MovIdx || 8582 ShuffleMask[Start + i] % Length != i)) 8583 return -1; 8584 } 8585 return MovIdx; 8586 }; 8587 8588 for (int Part = 0; Part < 4; ++Part) { 8589 // Does this part look like a mov? 8590 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize); 8591 if (Elt != -1) { 8592 SDValue Input = Op->getOperand(0); 8593 if (Elt >= 4) { 8594 Input = Op->getOperand(1); 8595 Elt -= 4; 8596 } 8597 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input); 8598 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast, 8599 DAG.getConstant(Elt, dl, MVT::i32)); 8600 } 8601 } 8602 8603 // Nothing interesting found, just return 8604 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3]) 8605 return SDValue(); 8606 8607 // The other parts need to be built with the old shuffle vector, cast to a 8608 // v4i32 and extract_vector_elts 8609 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) { 8610 SmallVector<int, 16> NewShuffleMask; 8611 for (int Part = 0; Part < 4; ++Part) 8612 for (int i = 0; i < QuarterSize; i++) 8613 NewShuffleMask.push_back( 8614 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]); 8615 SDValue NewShuffle = DAG.getVectorShuffle( 8616 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); 8617 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle); 8618 8619 for (int Part = 0; Part < 4; ++Part) 8620 if (!Parts[Part]) 8621 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, 8622 BitCast, DAG.getConstant(Part, dl, MVT::i32)); 8623 } 8624 // Build a vector out of the various parts and bitcast it back to the original 8625 // type. 8626 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts); 8627 return DAG.getBitcast(VT, NewVec); 8628 } 8629 8630 static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, 8631 ArrayRef<int> ShuffleMask, 8632 SelectionDAG &DAG) { 8633 SDValue V1 = Op.getOperand(0); 8634 SDValue V2 = Op.getOperand(1); 8635 EVT VT = Op.getValueType(); 8636 unsigned NumElts = VT.getVectorNumElements(); 8637 8638 // A one-off identity mask is one that is mostly an identity mask from a 8639 // single source but contains a single element out-of-place, either from a 8640 // different vector or from another position in the same vector. Rather than 8641 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert 8642 // pair directly.
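// e.g. for v4i32 the mask <0, 1, 6, 3> is an identity from V1 except for lane
// 2, which takes lane 2 of V2; it becomes an EXTRACT_VECTOR_ELT of V2 followed
// by an INSERT_VECTOR_ELT into V1.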
8643 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset, 8644 int &OffElement) { 8645 OffElement = -1; 8646 int NonUndef = 0; 8647 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) { 8648 if (Mask[i] == -1) 8649 continue; 8650 NonUndef++; 8651 if (Mask[i] != i + BaseOffset) { 8652 if (OffElement == -1) 8653 OffElement = i; 8654 else 8655 return false; 8656 } 8657 } 8658 return NonUndef > 2 && OffElement != -1; 8659 }; 8660 int OffElement; 8661 SDValue VInput; 8662 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement)) 8663 VInput = V1; 8664 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement)) 8665 VInput = V2; 8666 else 8667 return SDValue(); 8668 8669 SDLoc dl(Op); 8670 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16 8671 ? MVT::i32 8672 : VT.getScalarType(); 8673 SDValue Elt = DAG.getNode( 8674 ISD::EXTRACT_VECTOR_ELT, dl, SVT, 8675 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2, 8676 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl)); 8677 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt, 8678 DAG.getVectorIdxConstant(OffElement % NumElts, dl)); 8679 } 8680 8681 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, 8682 const ARMSubtarget *ST) { 8683 SDValue V1 = Op.getOperand(0); 8684 SDValue V2 = Op.getOperand(1); 8685 SDLoc dl(Op); 8686 EVT VT = Op.getValueType(); 8687 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 8688 unsigned EltSize = VT.getScalarSizeInBits(); 8689 8690 if (ST->hasMVEIntegerOps() && EltSize == 1) 8691 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); 8692 8693 // Convert shuffles that are directly supported on NEON to target-specific 8694 // DAG nodes, instead of keeping them as shuffles and matching them again 8695 // during code selection. This is more efficient and avoids the possibility 8696 // of inconsistencies between legalization and selection. 8697 // FIXME: floating-point vectors should be canonicalized to integer vectors 8698 // of the same time so that they get CSEd properly. 8699 ArrayRef<int> ShuffleMask = SVN->getMask(); 8700 8701 if (EltSize <= 32) { 8702 if (SVN->isSplat()) { 8703 int Lane = SVN->getSplatIndex(); 8704 // If this is undef splat, generate it via "just" vdup, if possible. 8705 if (Lane == -1) Lane = 0; 8706 8707 // Test if V1 is a SCALAR_TO_VECTOR. 8708 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 8709 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 8710 } 8711 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 8712 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 8713 // reaches it). 
8714 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 8715 !isa<ConstantSDNode>(V1.getOperand(0))) { 8716 bool IsScalarToVector = true; 8717 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 8718 if (!V1.getOperand(i).isUndef()) { 8719 IsScalarToVector = false; 8720 break; 8721 } 8722 if (IsScalarToVector) 8723 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 8724 } 8725 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 8726 DAG.getConstant(Lane, dl, MVT::i32)); 8727 } 8728 8729 bool ReverseVEXT = false; 8730 unsigned Imm = 0; 8731 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 8732 if (ReverseVEXT) 8733 std::swap(V1, V2); 8734 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 8735 DAG.getConstant(Imm, dl, MVT::i32)); 8736 } 8737 8738 if (isVREVMask(ShuffleMask, VT, 64)) 8739 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 8740 if (isVREVMask(ShuffleMask, VT, 32)) 8741 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 8742 if (isVREVMask(ShuffleMask, VT, 16)) 8743 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 8744 8745 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 8746 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 8747 DAG.getConstant(Imm, dl, MVT::i32)); 8748 } 8749 8750 // Check for Neon shuffles that modify both input vectors in place. 8751 // If both results are used, i.e., if there are two shuffles with the same 8752 // source operands and with masks corresponding to both results of one of 8753 // these operations, DAG memoization will ensure that a single node is 8754 // used for both shuffles. 8755 unsigned WhichResult = 0; 8756 bool isV_UNDEF = false; 8757 if (ST->hasNEON()) { 8758 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8759 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 8760 if (isV_UNDEF) 8761 V2 = V1; 8762 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 8763 .getValue(WhichResult); 8764 } 8765 } 8766 if (ST->hasMVEIntegerOps()) { 8767 if (isVMOVNMask(ShuffleMask, VT, false, false)) 8768 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, 8769 DAG.getConstant(0, dl, MVT::i32)); 8770 if (isVMOVNMask(ShuffleMask, VT, true, false)) 8771 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, 8772 DAG.getConstant(1, dl, MVT::i32)); 8773 if (isVMOVNMask(ShuffleMask, VT, true, true)) 8774 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1, 8775 DAG.getConstant(1, dl, MVT::i32)); 8776 } 8777 8778 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 8779 // shuffles that produce a result larger than their operands with: 8780 // shuffle(concat(v1, undef), concat(v2, undef)) 8781 // -> 8782 // shuffle(concat(v1, v2), undef) 8783 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 8784 // 8785 // This is useful in the general case, but there are special cases where 8786 // native shuffles produce larger results: the two-result ops. 8787 // 8788 // Look through the concat when lowering them: 8789 // shuffle(concat(v1, v2), undef) 8790 // -> 8791 // concat(VZIP(v1, v2):0, :1) 8792 // 8793 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 8794 SDValue SubV1 = V1->getOperand(0); 8795 SDValue SubV2 = V1->getOperand(1); 8796 EVT SubVT = SubV1.getValueType(); 8797 8798 // We expect these to have been canonicalized to -1. 
8799 assert(llvm::all_of(ShuffleMask, [&](int i) { 8800 return i < (int)VT.getVectorNumElements(); 8801 }) && "Unexpected shuffle index into UNDEF operand!"); 8802 8803 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8804 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 8805 if (isV_UNDEF) 8806 SubV2 = SubV1; 8807 assert((WhichResult == 0) && 8808 "In-place shuffle of concat can only have one result!"); 8809 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 8810 SubV1, SubV2); 8811 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 8812 Res.getValue(1)); 8813 } 8814 } 8815 } 8816 8817 if (ST->hasMVEIntegerOps() && EltSize <= 32) 8818 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG)) 8819 return V; 8820 8821 // If the shuffle is not directly supported and it has 4 elements, use 8822 // the PerfectShuffle-generated table to synthesize it from other shuffles. 8823 unsigned NumElts = VT.getVectorNumElements(); 8824 if (NumElts == 4) { 8825 unsigned PFIndexes[4]; 8826 for (unsigned i = 0; i != 4; ++i) { 8827 if (ShuffleMask[i] < 0) 8828 PFIndexes[i] = 8; 8829 else 8830 PFIndexes[i] = ShuffleMask[i]; 8831 } 8832 8833 // Compute the index in the perfect shuffle table. 8834 unsigned PFTableIndex = 8835 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8836 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8837 unsigned Cost = (PFEntry >> 30); 8838 8839 if (Cost <= 4) { 8840 if (ST->hasNEON()) 8841 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8842 else if (isLegalMVEShuffleOp(PFEntry)) { 8843 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8844 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8845 unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; 8846 unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; 8847 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) 8848 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8849 } 8850 } 8851 } 8852 8853 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 8854 if (EltSize >= 32) { 8855 // Do the expansion with floating-point types, since that is what the VFP 8856 // registers are defined to use, and since i64 is not legal. 8857 EVT EltVT = EVT::getFloatingPointVT(EltSize); 8858 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 8859 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 8860 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 8861 SmallVector<SDValue, 8> Ops; 8862 for (unsigned i = 0; i < NumElts; ++i) { 8863 if (ShuffleMask[i] < 0) 8864 Ops.push_back(DAG.getUNDEF(EltVT)); 8865 else 8866 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 8867 ShuffleMask[i] < (int)NumElts ? 
V1 : V2, 8868 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 8869 dl, MVT::i32))); 8870 } 8871 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 8872 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 8873 } 8874 8875 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 8876 isReverseMask(ShuffleMask, VT)) 8877 return LowerReverse_VECTOR_SHUFFLE(Op, DAG); 8878 8879 if (ST->hasNEON() && VT == MVT::v8i8) 8880 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 8881 return NewOp; 8882 8883 if (ST->hasMVEIntegerOps()) 8884 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG)) 8885 return NewOp; 8886 8887 return SDValue(); 8888 } 8889 8890 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8891 const ARMSubtarget *ST) { 8892 EVT VecVT = Op.getOperand(0).getValueType(); 8893 SDLoc dl(Op); 8894 8895 assert(ST->hasMVEIntegerOps() && 8896 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8897 8898 SDValue Conv = 8899 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8900 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8901 unsigned LaneWidth = 8902 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8903 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; 8904 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, 8905 Op.getOperand(1), DAG.getValueType(MVT::i1)); 8906 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, 8907 DAG.getConstant(~Mask, dl, MVT::i32)); 8908 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); 8909 } 8910 8911 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 8912 SelectionDAG &DAG) const { 8913 // INSERT_VECTOR_ELT is legal only for immediate indexes. 8914 SDValue Lane = Op.getOperand(2); 8915 if (!isa<ConstantSDNode>(Lane)) 8916 return SDValue(); 8917 8918 SDValue Elt = Op.getOperand(1); 8919 EVT EltVT = Elt.getValueType(); 8920 8921 if (Subtarget->hasMVEIntegerOps() && 8922 Op.getValueType().getScalarSizeInBits() == 1) 8923 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); 8924 8925 if (getTypeAction(*DAG.getContext(), EltVT) == 8926 TargetLowering::TypePromoteFloat) { 8927 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, 8928 // but the type system will try to do that if we don't intervene. 8929 // Reinterpret any such vector-element insertion as one with the 8930 // corresponding integer types. 
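// As a rough illustration with example types: inserting an f16 into a v8f16
// is rewritten below as a bitcast of the scalar to i16 and of the vector to
// v8i16, an i16 INSERT_VECTOR_ELT, and a bitcast of the result back to v8f16.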
8931
8932 SDLoc dl(Op);
8933
8934 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8935 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8936 TargetLowering::TypePromoteFloat);
8937
8938 SDValue VecIn = Op.getOperand(0);
8939 EVT VecVT = VecIn.getValueType();
8940 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8941 VecVT.getVectorNumElements());
8942
8943 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8944 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8945 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8946 IVecIn, IElt, Lane);
8947 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8948 }
8949
8950 return Op;
8951 }
8952
8953 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8954 const ARMSubtarget *ST) {
8955 EVT VecVT = Op.getOperand(0).getValueType();
8956 SDLoc dl(Op);
8957
8958 assert(ST->hasMVEIntegerOps() &&
8959 "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");
8960
8961 SDValue Conv =
8962 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8963 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
8964 unsigned LaneWidth =
8965 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8966 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8967 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8968 return Shift;
8969 }
8970
8971 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
8972 const ARMSubtarget *ST) {
8973 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8974 SDValue Lane = Op.getOperand(1);
8975 if (!isa<ConstantSDNode>(Lane))
8976 return SDValue();
8977
8978 SDValue Vec = Op.getOperand(0);
8979 EVT VT = Vec.getValueType();
8980
8981 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8982 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8983
8984 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8985 SDLoc dl(Op);
8986 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8987 }
8988
8989 return Op;
8990 }
8991
8992 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
8993 const ARMSubtarget *ST) {
8994 SDLoc dl(Op);
8995 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
8996 "Unexpected custom CONCAT_VECTORS lowering");
8997 assert(isPowerOf2_32(Op.getNumOperands()) &&
8998 "Unexpected custom CONCAT_VECTORS lowering");
8999 assert(ST->hasMVEIntegerOps() &&
9000 "CONCAT_VECTORS lowering only supported for MVE");
9001
9002 auto ConcatPair = [&](SDValue V1, SDValue V2) {
9003 EVT Op1VT = V1.getValueType();
9004 EVT Op2VT = V2.getValueType();
9005 assert(Op1VT == Op2VT && "Operand types don't match!");
9006 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
9007
9008 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9009 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
9010
9011 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
9012 // promoted to v8i16, etc.
9013 MVT ElType =
9014 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9015 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
9016
9017 // Extract the vector elements from Op1 and Op2 one by one and truncate them
9018 // to be the right size for the destination. For example, if Op1 is v4i1
9019 // then the promoted vector is v4i32. The result of concatenation gives a
9020 // v8i1, which when promoted is v8i16.
That means each i32 element from Op1 9021 // needs truncating to i16 and inserting in the result. 9022 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); 9023 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); 9024 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { 9025 EVT NewVT = NewV.getValueType(); 9026 EVT ConcatVT = ConVec.getValueType(); 9027 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { 9028 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, 9029 DAG.getIntPtrConstant(i, dl)); 9030 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, 9031 DAG.getConstant(j, dl, MVT::i32)); 9032 } 9033 return ConVec; 9034 }; 9035 unsigned j = 0; 9036 ConVec = ExtractInto(NewV1, ConVec, j); 9037 ConVec = ExtractInto(NewV2, ConVec, j); 9038 9039 // Now return the result of comparing the subvector with zero, which will 9040 // generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 we 9041 // convert to a v4i1 compare to fill in the two halves of the i64 as i32s. 9042 if (VT == MVT::v2i1) { 9043 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, ConVec); 9044 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, 9045 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 9046 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); 9047 } 9048 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, 9049 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 9050 }; 9051 9052 // Concat each pair of subvectors and pack into the lower half of the array. 9053 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end()); 9054 while (ConcatOps.size() > 1) { 9055 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { 9056 SDValue V1 = ConcatOps[I]; 9057 SDValue V2 = ConcatOps[I + 1]; 9058 ConcatOps[I / 2] = ConcatPair(V1, V2); 9059 } 9060 ConcatOps.resize(ConcatOps.size() / 2); 9061 } 9062 return ConcatOps[0]; 9063 } 9064 9065 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, 9066 const ARMSubtarget *ST) { 9067 EVT VT = Op->getValueType(0); 9068 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 9069 return LowerCONCAT_VECTORS_i1(Op, DAG, ST); 9070 9071 // The only time a CONCAT_VECTORS operation can have legal types is when 9072 // two 64-bit vectors are concatenated to a 128-bit vector. 
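// For example, concat(v2f32 a, v2f32 b) : v4f32 is built below by bitcasting
// each 64-bit operand to f64, inserting both into an undef v2f64, and
// bitcasting that v2f64 back to the original 128-bit result type.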
9073 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 9074 "unexpected CONCAT_VECTORS"); 9075 SDLoc dl(Op); 9076 SDValue Val = DAG.getUNDEF(MVT::v2f64); 9077 SDValue Op0 = Op.getOperand(0); 9078 SDValue Op1 = Op.getOperand(1); 9079 if (!Op0.isUndef()) 9080 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 9081 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 9082 DAG.getIntPtrConstant(0, dl)); 9083 if (!Op1.isUndef()) 9084 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 9085 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 9086 DAG.getIntPtrConstant(1, dl)); 9087 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 9088 } 9089 9090 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, 9091 const ARMSubtarget *ST) { 9092 SDValue V1 = Op.getOperand(0); 9093 SDValue V2 = Op.getOperand(1); 9094 SDLoc dl(Op); 9095 EVT VT = Op.getValueType(); 9096 EVT Op1VT = V1.getValueType(); 9097 unsigned NumElts = VT.getVectorNumElements(); 9098 unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); 9099 9100 assert(VT.getScalarSizeInBits() == 1 && 9101 "Unexpected custom EXTRACT_SUBVECTOR lowering"); 9102 assert(ST->hasMVEIntegerOps() && 9103 "EXTRACT_SUBVECTOR lowering only supported for MVE"); 9104 9105 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 9106 9107 // We now have Op1 promoted to a vector of integers, where v8i1 gets 9108 // promoted to v8i16, etc. 9109 9110 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 9111 9112 if (NumElts == 2) { 9113 EVT SubVT = MVT::v4i32; 9114 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); 9115 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) { 9116 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, 9117 DAG.getIntPtrConstant(i, dl)); 9118 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 9119 DAG.getConstant(j, dl, MVT::i32)); 9120 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 9121 DAG.getConstant(j + 1, dl, MVT::i32)); 9122 } 9123 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec, 9124 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 9125 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); 9126 } 9127 9128 EVT SubVT = MVT::getVectorVT(ElType, NumElts); 9129 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); 9130 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { 9131 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, 9132 DAG.getIntPtrConstant(i, dl)); 9133 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 9134 DAG.getConstant(j, dl, MVT::i32)); 9135 } 9136 9137 // Now return the result of comparing the subvector with zero, 9138 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 9139 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec, 9140 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 9141 } 9142 9143 // Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0). 
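// E.g. a trunc of v4i32 %x to v4i1 becomes, roughly:
//   %a = and %x, splat(1)
//   %r = setcc %a, splat(0), setne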
9144 static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
9145 const ARMSubtarget *ST) {
9146 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9147 EVT VT = N->getValueType(0);
9148 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9149 "Expected a vector i1 type!");
9150 SDValue Op = N->getOperand(0);
9151 EVT FromVT = Op.getValueType();
9152 SDLoc DL(N);
9153
9154 SDValue And =
9155 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9156 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9157 DAG.getCondCode(ISD::SETNE));
9158 }
9159
9160 static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9161 const ARMSubtarget *Subtarget) {
9162 if (!Subtarget->hasMVEIntegerOps())
9163 return SDValue();
9164
9165 EVT ToVT = N->getValueType(0);
9166 if (ToVT.getScalarType() == MVT::i1)
9167 return LowerTruncatei1(N, DAG, Subtarget);
9168
9169 // MVE does not have a single instruction to perform the truncation of a v4i32
9170 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9171 // Most of the instructions in MVE follow the 'Beats' system, where moving
9172 // values from different lanes is usually something that the instructions
9173 // avoid.
9174 //
9175 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9176 // which take the top/bottom half of a larger lane and extend it (or do the
9177 // opposite, truncating into the top/bottom lane from a larger lane). Note
9178 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9179 // bottom 16bits from each vector lane. This works really well with T/B
9180 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9181 // to change order.
9182 //
9183 // But truncates and sext/zext are always going to be fairly common from llvm.
9184 // We have several options for how to deal with them:
9185 // - Wherever possible combine them into an instruction that makes them
9186 // "free". This includes loads/stores, which can perform the trunc as part
9187 // of the memory operation. Or certain shuffles that can be turned into
9188 // VMOVN/VMOVL.
9189 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9190 // trunc(mul(sext(a), sext(b))) may become
9191 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9192 // this case can use VMULL). This is performed in the
9193 // MVELaneInterleavingPass.
9194 // - Otherwise we have an option. By default we would expand the
9195 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9196 // registers. One for each vector lane in the vector. This can obviously be
9197 // very expensive.
9198 // - The other option is to use the fact that loads/stores can extend/truncate
9199 // to turn a trunc into two truncating stack stores and a stack reload. This
9200 // becomes 3 back-to-back memory operations, but at least that is less than
9201 // all the insert/extracts.
9202 //
9203 // In order to do the last, we convert certain truncs into MVETRUNC, which
9204 // are either optimized where they can be, or eventually lowered into stack
9205 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9206 // too early, where other instructions would be better, and stops us from
9207 // having to reconstruct multiple buildvector shuffles into loads/stores.
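// As a sketch of that last point: a trunc from v8i32 to v8i16 is split below
// into two v4i32 halves and wrapped as ARMISD::MVETRUNC(lo, hi); if no better
// combine is found for it, the MVETRUNC is eventually expanded via two
// truncating stack stores and a single v8i16 reload.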
9208 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8) 9209 return SDValue(); 9210 EVT FromVT = N->getOperand(0).getValueType(); 9211 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16) 9212 return SDValue(); 9213 9214 SDValue Lo, Hi; 9215 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); 9216 SDLoc DL(N); 9217 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi); 9218 } 9219 9220 static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, 9221 const ARMSubtarget *Subtarget) { 9222 if (!Subtarget->hasMVEIntegerOps()) 9223 return SDValue(); 9224 9225 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC. 9226 9227 EVT ToVT = N->getValueType(0); 9228 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16) 9229 return SDValue(); 9230 SDValue Op = N->getOperand(0); 9231 EVT FromVT = Op.getValueType(); 9232 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8) 9233 return SDValue(); 9234 9235 SDLoc DL(N); 9236 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); 9237 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) 9238 ExtVT = MVT::v8i16; 9239 9240 unsigned Opcode = 9241 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT; 9242 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op); 9243 SDValue Ext1 = Ext.getValue(1); 9244 9245 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) { 9246 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext); 9247 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1); 9248 } 9249 9250 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1); 9251 } 9252 9253 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 9254 /// element has been zero/sign-extended, depending on the isSigned parameter, 9255 /// from an integer type half its size. 9256 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 9257 bool isSigned) { 9258 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 9259 EVT VT = N->getValueType(0); 9260 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 9261 SDNode *BVN = N->getOperand(0).getNode(); 9262 if (BVN->getValueType(0) != MVT::v4i32 || 9263 BVN->getOpcode() != ISD::BUILD_VECTOR) 9264 return false; 9265 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 
1 : 0; 9266 unsigned HiElt = 1 - LoElt; 9267 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 9268 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 9269 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 9270 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 9271 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 9272 return false; 9273 if (isSigned) { 9274 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 9275 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 9276 return true; 9277 } else { 9278 if (Hi0->isZero() && Hi1->isZero()) 9279 return true; 9280 } 9281 return false; 9282 } 9283 9284 if (N->getOpcode() != ISD::BUILD_VECTOR) 9285 return false; 9286 9287 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 9288 SDNode *Elt = N->getOperand(i).getNode(); 9289 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 9290 unsigned EltSize = VT.getScalarSizeInBits(); 9291 unsigned HalfSize = EltSize / 2; 9292 if (isSigned) { 9293 if (!isIntN(HalfSize, C->getSExtValue())) 9294 return false; 9295 } else { 9296 if (!isUIntN(HalfSize, C->getZExtValue())) 9297 return false; 9298 } 9299 continue; 9300 } 9301 return false; 9302 } 9303 9304 return true; 9305 } 9306 9307 /// isSignExtended - Check if a node is a vector value that is sign-extended 9308 /// or a constant BUILD_VECTOR with sign-extended elements. 9309 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 9310 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 9311 return true; 9312 if (isExtendedBUILD_VECTOR(N, DAG, true)) 9313 return true; 9314 return false; 9315 } 9316 9317 /// isZeroExtended - Check if a node is a vector value that is zero-extended (or 9318 /// any-extended) or a constant BUILD_VECTOR with zero-extended elements. 9319 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 9320 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND || 9321 ISD::isZEXTLoad(N)) 9322 return true; 9323 if (isExtendedBUILD_VECTOR(N, DAG, false)) 9324 return true; 9325 return false; 9326 } 9327 9328 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 9329 if (OrigVT.getSizeInBits() >= 64) 9330 return OrigVT; 9331 9332 assert(OrigVT.isSimple() && "Expecting a simple value type"); 9333 9334 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 9335 switch (OrigSimpleTy) { 9336 default: llvm_unreachable("Unexpected Vector Type"); 9337 case MVT::v2i8: 9338 case MVT::v2i16: 9339 return MVT::v2i32; 9340 case MVT::v4i8: 9341 return MVT::v4i16; 9342 } 9343 } 9344 9345 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 9346 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 9347 /// We insert the required extension here to get the vector to fill a D register. 9348 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 9349 const EVT &OrigTy, 9350 const EVT &ExtTy, 9351 unsigned ExtOpcode) { 9352 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 9353 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 9354 // 64-bits we need to insert a new extension so that it will be 64-bits. 9355 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 9356 if (OrigTy.getSizeInBits() >= 64) 9357 return N; 9358 9359 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
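// E.g. if the value was originally a v4i8 (32 bits) that had been sign/zero
// extended to v4i32, it is re-extended here to v4i16 so that it fills a
// 64-bit D register.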
9360 EVT NewVT = getExtensionTo64Bits(OrigTy);
9361
9362 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9363 }
9364
9365 /// SkipLoadExtensionForVMULL - Return a load of the original vector size that
9366 /// does not do any sign/zero extension. If the original vector is less
9367 /// than 64 bits, an appropriate extension will be added after the load to
9368 /// reach a total size of 64 bits. We have to add the extension separately
9369 /// because ARM does not have a sign/zero extending load for vectors.
9370 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
9371 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9372
9373 // The load already has the right type.
9374 if (ExtendedTy == LD->getMemoryVT())
9375 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9376 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9377 LD->getMemOperand()->getFlags());
9378
9379 // We need to create a zextload/sextload. We cannot just create a load
9380 // followed by a zext/sext node because LowerMUL is also run during normal
9381 // operation legalization where we can't create illegal types.
9382 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9383 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9384 LD->getMemoryVT(), LD->getAlign(),
9385 LD->getMemOperand()->getFlags());
9386 }
9387
9388 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9389 /// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9390 /// the unextended value. The unextended vector should be 64 bits so that it can
9391 /// be used as an operand to a VMULL instruction. If the original vector size
9392 /// before extension is less than 64 bits we add an extension to resize
9393 /// the vector to 64 bits.
9394 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9395 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9396 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9397 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9398 N->getOperand(0)->getValueType(0),
9399 N->getValueType(0),
9400 N->getOpcode());
9401
9402 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9403 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9404 "Expected extending load");
9405
9406 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9407 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9408 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9409 SDValue extLoad =
9410 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9411 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9412
9413 return newLoad;
9414 }
9415
9416 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9417 // have been legalized as a BITCAST from v4i32.
9418 if (N->getOpcode() == ISD::BITCAST) {
9419 SDNode *BVN = N->getOperand(0).getNode();
9420 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9421 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9422 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9423 return DAG.getBuildVector(
9424 MVT::v2i32, SDLoc(N),
9425 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9426 }
9427 // Construct a new BUILD_VECTOR with elements truncated to half the size.
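// E.g. a constant v4i32 BUILD_VECTOR whose elements all fit in i16 is rebuilt
// as a v4i16 BUILD_VECTOR; the operands remain i32 constants (sub-32-bit
// scalar types are not legal here) and the values are implicitly truncated.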
9428 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 9429 EVT VT = N->getValueType(0); 9430 unsigned EltSize = VT.getScalarSizeInBits() / 2; 9431 unsigned NumElts = VT.getVectorNumElements(); 9432 MVT TruncVT = MVT::getIntegerVT(EltSize); 9433 SmallVector<SDValue, 8> Ops; 9434 SDLoc dl(N); 9435 for (unsigned i = 0; i != NumElts; ++i) { 9436 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 9437 const APInt &CInt = C->getAPIntValue(); 9438 // Element types smaller than 32 bits are not legal, so use i32 elements. 9439 // The values are implicitly truncated so sext vs. zext doesn't matter. 9440 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 9441 } 9442 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 9443 } 9444 9445 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 9446 unsigned Opcode = N->getOpcode(); 9447 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 9448 SDNode *N0 = N->getOperand(0).getNode(); 9449 SDNode *N1 = N->getOperand(1).getNode(); 9450 return N0->hasOneUse() && N1->hasOneUse() && 9451 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 9452 } 9453 return false; 9454 } 9455 9456 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 9457 unsigned Opcode = N->getOpcode(); 9458 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 9459 SDNode *N0 = N->getOperand(0).getNode(); 9460 SDNode *N1 = N->getOperand(1).getNode(); 9461 return N0->hasOneUse() && N1->hasOneUse() && 9462 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 9463 } 9464 return false; 9465 } 9466 9467 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 9468 // Multiplications are only custom-lowered for 128-bit vectors so that 9469 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 9470 EVT VT = Op.getValueType(); 9471 assert(VT.is128BitVector() && VT.isInteger() && 9472 "unexpected type for custom-lowering ISD::MUL"); 9473 SDNode *N0 = Op.getOperand(0).getNode(); 9474 SDNode *N1 = Op.getOperand(1).getNode(); 9475 unsigned NewOpc = 0; 9476 bool isMLA = false; 9477 bool isN0SExt = isSignExtended(N0, DAG); 9478 bool isN1SExt = isSignExtended(N1, DAG); 9479 if (isN0SExt && isN1SExt) 9480 NewOpc = ARMISD::VMULLs; 9481 else { 9482 bool isN0ZExt = isZeroExtended(N0, DAG); 9483 bool isN1ZExt = isZeroExtended(N1, DAG); 9484 if (isN0ZExt && isN1ZExt) 9485 NewOpc = ARMISD::VMULLu; 9486 else if (isN1SExt || isN1ZExt) { 9487 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 9488 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 9489 if (isN1SExt && isAddSubSExt(N0, DAG)) { 9490 NewOpc = ARMISD::VMULLs; 9491 isMLA = true; 9492 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 9493 NewOpc = ARMISD::VMULLu; 9494 isMLA = true; 9495 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 9496 std::swap(N0, N1); 9497 NewOpc = ARMISD::VMULLu; 9498 isMLA = true; 9499 } 9500 } 9501 9502 if (!NewOpc) { 9503 if (VT == MVT::v2i64) 9504 // Fall through to expand this. It is not legal. 9505 return SDValue(); 9506 else 9507 // Other vector multiplications are legal. 9508 return Op; 9509 } 9510 } 9511 9512 // Legalize to a VMULL instruction. 
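// E.g. (mul (sext v4i16 a), (sext v4i16 b)) : v4i32 becomes a single
// ARMISD::VMULLs of the two v4i16 operands; the MLA case handled below folds
// an add/sub of extended values into back-to-back vmull + vmlal/vmlsl.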
9513 SDLoc DL(Op); 9514 SDValue Op0; 9515 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 9516 if (!isMLA) { 9517 Op0 = SkipExtensionForVMULL(N0, DAG); 9518 assert(Op0.getValueType().is64BitVector() && 9519 Op1.getValueType().is64BitVector() && 9520 "unexpected types for extended operands to VMULL"); 9521 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 9522 } 9523 9524 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 9525 // isel lowering to take advantage of no-stall back to back vmul + vmla. 9526 // vmull q0, d4, d6 9527 // vmlal q0, d5, d6 9528 // is faster than 9529 // vaddl q0, d4, d5 9530 // vmovl q1, d6 9531 // vmul q0, q0, q1 9532 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 9533 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 9534 EVT Op1VT = Op1.getValueType(); 9535 return DAG.getNode(N0->getOpcode(), DL, VT, 9536 DAG.getNode(NewOpc, DL, VT, 9537 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 9538 DAG.getNode(NewOpc, DL, VT, 9539 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 9540 } 9541 9542 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 9543 SelectionDAG &DAG) { 9544 // TODO: Should this propagate fast-math-flags? 9545 9546 // Convert to float 9547 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 9548 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 9549 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 9550 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 9551 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 9552 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 9553 // Get reciprocal estimate. 9554 // float4 recip = vrecpeq_f32(yf); 9555 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9556 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9557 Y); 9558 // Because char has a smaller range than uchar, we can actually get away 9559 // without any newton steps. This requires that we use a weird bias 9560 // of 0xb000, however (again, this has been exhaustively tested). 9561 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 9562 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 9563 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 9564 Y = DAG.getConstant(0xb000, dl, MVT::v4i32); 9565 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 9566 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 9567 // Convert back to short. 9568 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 9569 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 9570 return X; 9571 } 9572 9573 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, 9574 SelectionDAG &DAG) { 9575 // TODO: Should this propagate fast-math-flags? 9576 9577 SDValue N2; 9578 // Convert to float. 9579 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 9580 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 9581 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 9582 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 9583 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 9584 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 9585 9586 // Use reciprocal estimate and one refinement step. 
9587 // float4 recip = vrecpeq_f32(yf); 9588 // recip *= vrecpsq_f32(yf, recip); 9589 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9590 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9591 N1); 9592 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9593 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9594 N1, N2); 9595 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9596 // Because short has a smaller range than ushort, we can actually get away 9597 // with only a single newton step. This requires that we use a weird bias 9598 // of 89, however (again, this has been exhaustively tested). 9599 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 9600 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 9601 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 9602 N1 = DAG.getConstant(0x89, dl, MVT::v4i32); 9603 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 9604 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 9605 // Convert back to integer and return. 9606 // return vmovn_s32(vcvt_s32_f32(result)); 9607 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 9608 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 9609 return N0; 9610 } 9611 9612 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, 9613 const ARMSubtarget *ST) { 9614 EVT VT = Op.getValueType(); 9615 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 9616 "unexpected type for custom-lowering ISD::SDIV"); 9617 9618 SDLoc dl(Op); 9619 SDValue N0 = Op.getOperand(0); 9620 SDValue N1 = Op.getOperand(1); 9621 SDValue N2, N3; 9622 9623 if (VT == MVT::v8i8) { 9624 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 9625 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 9626 9627 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9628 DAG.getIntPtrConstant(4, dl)); 9629 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9630 DAG.getIntPtrConstant(4, dl)); 9631 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9632 DAG.getIntPtrConstant(0, dl)); 9633 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9634 DAG.getIntPtrConstant(0, dl)); 9635 9636 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 9637 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 9638 9639 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 9640 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 9641 9642 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 9643 return N0; 9644 } 9645 return LowerSDIV_v4i16(N0, N1, dl, DAG); 9646 } 9647 9648 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, 9649 const ARMSubtarget *ST) { 9650 // TODO: Should this propagate fast-math-flags? 
9651 EVT VT = Op.getValueType(); 9652 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 9653 "unexpected type for custom-lowering ISD::UDIV"); 9654 9655 SDLoc dl(Op); 9656 SDValue N0 = Op.getOperand(0); 9657 SDValue N1 = Op.getOperand(1); 9658 SDValue N2, N3; 9659 9660 if (VT == MVT::v8i8) { 9661 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 9662 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 9663 9664 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9665 DAG.getIntPtrConstant(4, dl)); 9666 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9667 DAG.getIntPtrConstant(4, dl)); 9668 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9669 DAG.getIntPtrConstant(0, dl)); 9670 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9671 DAG.getIntPtrConstant(0, dl)); 9672 9673 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 9674 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 9675 9676 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 9677 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 9678 9679 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 9680 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 9681 MVT::i32), 9682 N0); 9683 return N0; 9684 } 9685 9686 // v4i16 sdiv ... Convert to float. 9687 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 9688 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 9689 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 9690 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 9691 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 9692 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 9693 9694 // Use reciprocal estimate and two refinement steps. 9695 // float4 recip = vrecpeq_f32(yf); 9696 // recip *= vrecpsq_f32(yf, recip); 9697 // recip *= vrecpsq_f32(yf, recip); 9698 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9699 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9700 BN1); 9701 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9702 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9703 BN1, N2); 9704 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9705 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9706 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9707 BN1, N2); 9708 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9709 // Simply multiplying by the reciprocal estimate can leave us a few ulps 9710 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 9711 // and that it will never cause us to return an answer too large). 9712 // float4 result = as_float4(as_int4(xf*recip) + 2); 9713 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 9714 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 9715 N1 = DAG.getConstant(2, dl, MVT::v4i32); 9716 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 9717 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 9718 // Convert back to integer and return. 9719 // return vmovn_u32(vcvt_s32_f32(result)); 9720 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 9721 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 9722 return N0; 9723 } 9724 9725 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { 9726 SDNode *N = Op.getNode(); 9727 EVT VT = N->getValueType(0); 9728 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 9729 9730 SDValue Carry = Op.getOperand(2); 9731 9732 SDLoc DL(Op); 9733 9734 SDValue Result; 9735 if (Op.getOpcode() == ISD::ADDCARRY) { 9736 // This converts the boolean value carry into the carry flag. 
9737 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 9738 9739 // Do the addition proper using the carry flag we wanted. 9740 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), 9741 Op.getOperand(1), Carry); 9742 9743 // Now convert the carry flag into a boolean value. 9744 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 9745 } else { 9746 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 9747 // have to invert the carry first. 9748 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 9749 DAG.getConstant(1, DL, MVT::i32), Carry); 9750 // This converts the boolean value carry into the carry flag. 9751 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 9752 9753 // Do the subtraction proper using the carry flag we wanted. 9754 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), 9755 Op.getOperand(1), Carry); 9756 9757 // Now convert the carry flag into a boolean value. 9758 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 9759 // But the carry returned by ARMISD::SUBE is not a borrow as expected 9760 // by ISD::SUBCARRY, so compute 1 - C. 9761 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 9762 DAG.getConstant(1, DL, MVT::i32), Carry); 9763 } 9764 9765 // Return both values. 9766 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); 9767 } 9768 9769 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 9770 assert(Subtarget->isTargetDarwin()); 9771 9772 // For iOS, we want to call an alternative entry point: __sincos_stret, 9773 // return values are passed via sret. 9774 SDLoc dl(Op); 9775 SDValue Arg = Op.getOperand(0); 9776 EVT ArgVT = Arg.getValueType(); 9777 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 9778 auto PtrVT = getPointerTy(DAG.getDataLayout()); 9779 9780 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 9781 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9782 9783 // Pair of floats / doubles used to pass the result. 9784 Type *RetTy = StructType::get(ArgTy, ArgTy); 9785 auto &DL = DAG.getDataLayout(); 9786 9787 ArgListTy Args; 9788 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 9789 SDValue SRet; 9790 if (ShouldUseSRet) { 9791 // Create stack object for sret. 9792 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 9793 const Align StackAlign = DL.getPrefTypeAlign(RetTy); 9794 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 9795 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 9796 9797 ArgListEntry Entry; 9798 Entry.Node = SRet; 9799 Entry.Ty = RetTy->getPointerTo(); 9800 Entry.IsSExt = false; 9801 Entry.IsZExt = false; 9802 Entry.IsSRet = true; 9803 Args.push_back(Entry); 9804 RetTy = Type::getVoidTy(*DAG.getContext()); 9805 } 9806 9807 ArgListEntry Entry; 9808 Entry.Node = Arg; 9809 Entry.Ty = ArgTy; 9810 Entry.IsSExt = false; 9811 Entry.IsZExt = false; 9812 Args.push_back(Entry); 9813 9814 RTLIB::Libcall LC = 9815 (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; 9816 const char *LibcallName = getLibcallName(LC); 9817 CallingConv::ID CC = getLibcallCallingConv(LC); 9818 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 9819 9820 TargetLowering::CallLoweringInfo CLI(DAG); 9821 CLI.setDebugLoc(dl) 9822 .setChain(DAG.getEntryNode()) 9823 .setCallee(CC, RetTy, Callee, std::move(Args)) 9824 .setDiscardResult(ShouldUseSRet); 9825 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 9826 9827 if (!ShouldUseSRet) 9828 return CallResult.first; 9829 9830 SDValue LoadSin = 9831 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 9832 9833 // Address of cos field. 9834 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 9835 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 9836 SDValue LoadCos = 9837 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 9838 9839 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 9840 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 9841 LoadSin.getValue(0), LoadCos.getValue(0)); 9842 } 9843 9844 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 9845 bool Signed, 9846 SDValue &Chain) const { 9847 EVT VT = Op.getValueType(); 9848 assert((VT == MVT::i32 || VT == MVT::i64) && 9849 "unexpected type for custom lowering DIV"); 9850 SDLoc dl(Op); 9851 9852 const auto &DL = DAG.getDataLayout(); 9853 const auto &TLI = DAG.getTargetLoweringInfo(); 9854 9855 const char *Name = nullptr; 9856 if (Signed) 9857 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 9858 else 9859 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; 9860 9861 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 9862 9863 ARMTargetLowering::ArgListTy Args; 9864 9865 for (auto AI : {1, 0}) { 9866 ArgListEntry Arg; 9867 Arg.Node = Op.getOperand(AI); 9868 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 9869 Args.push_back(Arg); 9870 } 9871 9872 CallLoweringInfo CLI(DAG); 9873 CLI.setDebugLoc(dl) 9874 .setChain(Chain) 9875 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 9876 ES, std::move(Args)); 9877 9878 return LowerCallTo(CLI).first; 9879 } 9880 9881 // This is a code size optimisation: return the original SDIV node to 9882 // DAGCombiner when we don't want to expand SDIV into a sequence of 9883 // instructions, and an empty node otherwise which will cause the 9884 // SDIV to be expanded in DAGCombine. 9885 SDValue 9886 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 9887 SelectionDAG &DAG, 9888 SmallVectorImpl<SDNode *> &Created) const { 9889 // TODO: Support SREM 9890 if (N->getOpcode() != ISD::SDIV) 9891 return SDValue(); 9892 9893 const auto &ST = DAG.getSubtarget<ARMSubtarget>(); 9894 const bool MinSize = ST.hasMinSize(); 9895 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() 9896 : ST.hasDivideInARMMode(); 9897 9898 // Don't touch vector types; rewriting this may lead to scalarizing 9899 // the int divs. 9900 if (N->getOperand(0).getValueType().isVector()) 9901 return SDValue(); 9902 9903 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need 9904 // hwdiv support for this to be really profitable. 9905 if (!(MinSize && HasDivide)) 9906 return SDValue(); 9907 9908 // ARM mode is a bit simpler than Thumb: we can handle large power 9909 // of 2 immediates with 1 mov instruction; no further checks required, 9910 // just return the sdiv node. 
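// E.g. an sdiv by 4096 in ARM mode is still just a mov of the immediate plus
// the divide, whereas in Thumb an immediate above 128 forces a wide 4-byte
// MOV (see below), losing the size advantage of the 2-byte MOVS.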
9911 if (!ST.isThumb()) 9912 return SDValue(N, 0); 9913 9914 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, 9915 // and thus lose the code size benefits of a MOVS that requires only 2. 9916 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, 9917 // but as it's doing exactly this, it's not worth the trouble to get TTI. 9918 if (Divisor.sgt(128)) 9919 return SDValue(); 9920 9921 return SDValue(N, 0); 9922 } 9923 9924 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 9925 bool Signed) const { 9926 assert(Op.getValueType() == MVT::i32 && 9927 "unexpected type for custom lowering DIV"); 9928 SDLoc dl(Op); 9929 9930 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 9931 DAG.getEntryNode(), Op.getOperand(1)); 9932 9933 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 9934 } 9935 9936 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { 9937 SDLoc DL(N); 9938 SDValue Op = N->getOperand(1); 9939 if (N->getValueType(0) == MVT::i32) 9940 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); 9941 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 9942 DAG.getConstant(0, DL, MVT::i32)); 9943 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 9944 DAG.getConstant(1, DL, MVT::i32)); 9945 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, 9946 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); 9947 } 9948 9949 void ARMTargetLowering::ExpandDIV_Windows( 9950 SDValue Op, SelectionDAG &DAG, bool Signed, 9951 SmallVectorImpl<SDValue> &Results) const { 9952 const auto &DL = DAG.getDataLayout(); 9953 const auto &TLI = DAG.getTargetLoweringInfo(); 9954 9955 assert(Op.getValueType() == MVT::i64 && 9956 "unexpected type for custom lowering DIV"); 9957 SDLoc dl(Op); 9958 9959 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); 9960 9961 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 9962 9963 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 9964 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 9965 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 9966 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 9967 9968 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper)); 9969 } 9970 9971 static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { 9972 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); 9973 EVT MemVT = LD->getMemoryVT(); 9974 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || 9975 MemVT == MVT::v16i1) && 9976 "Expected a predicate type!"); 9977 assert(MemVT == Op.getValueType()); 9978 assert(LD->getExtensionType() == ISD::NON_EXTLOAD && 9979 "Expected a non-extending load"); 9980 assert(LD->isUnindexed() && "Expected a unindexed load"); 9981 9982 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit 9983 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We 9984 // need to make sure that 8/4/2 bits are actually loaded into the correct 9985 // place, which means loading the value and then shuffling the values into 9986 // the bottom bits of the predicate. 9987 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect 9988 // for BE). 9989 // Speaking of BE, apparently the rest of llvm will assume a reverse order to 9990 // a natural VMSR(load), so needs to be reversed. 
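// For example, an LLVM load of a v4i1 only reads 4 bits; below it is done as
// an extending i4 load into an i32 (bit-reversed and shifted down for BE),
// which is then PREDICATE_CASTed to v16i1 and has its low v4i1 subvector
// extracted.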
9991 9992 SDLoc dl(Op); 9993 SDValue Load = DAG.getExtLoad( 9994 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), 9995 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 9996 LD->getMemOperand()); 9997 SDValue Val = Load; 9998 if (DAG.getDataLayout().isBigEndian()) 9999 Val = DAG.getNode(ISD::SRL, dl, MVT::i32, 10000 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load), 10001 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32)); 10002 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val); 10003 if (MemVT != MVT::v16i1) 10004 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, 10005 DAG.getConstant(0, dl, MVT::i32)); 10006 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); 10007 } 10008 10009 void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, 10010 SelectionDAG &DAG) const { 10011 LoadSDNode *LD = cast<LoadSDNode>(N); 10012 EVT MemVT = LD->getMemoryVT(); 10013 assert(LD->isUnindexed() && "Loads should be unindexed at this point."); 10014 10015 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && 10016 !Subtarget->isThumb1Only() && LD->isVolatile()) { 10017 SDLoc dl(N); 10018 SDValue Result = DAG.getMemIntrinsicNode( 10019 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}), 10020 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand()); 10021 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1); 10022 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0); 10023 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 10024 Results.append({Pair, Result.getValue(2)}); 10025 } 10026 } 10027 10028 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { 10029 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); 10030 EVT MemVT = ST->getMemoryVT(); 10031 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || 10032 MemVT == MVT::v16i1) && 10033 "Expected a predicate type!"); 10034 assert(MemVT == ST->getValue().getValueType()); 10035 assert(!ST->isTruncatingStore() && "Expected a non-extending store"); 10036 assert(ST->isUnindexed() && "Expected a unindexed store"); 10037 10038 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with 10039 // top bits unset and a scalar store. 10040 SDLoc dl(Op); 10041 SDValue Build = ST->getValue(); 10042 if (MemVT != MVT::v16i1) { 10043 SmallVector<SDValue, 16> Ops; 10044 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) { 10045 unsigned Elt = DAG.getDataLayout().isBigEndian() 10046 ? 
MemVT.getVectorNumElements() - I - 1 10047 : I; 10048 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, 10049 DAG.getConstant(Elt, dl, MVT::i32))); 10050 } 10051 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) 10052 Ops.push_back(DAG.getUNDEF(MVT::i32)); 10053 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); 10054 } 10055 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); 10056 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian()) 10057 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32, 10058 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP), 10059 DAG.getConstant(16, dl, MVT::i32)); 10060 return DAG.getTruncStore( 10061 ST->getChain(), dl, GRP, ST->getBasePtr(), 10062 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 10063 ST->getMemOperand()); 10064 } 10065 10066 static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, 10067 const ARMSubtarget *Subtarget) { 10068 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); 10069 EVT MemVT = ST->getMemoryVT(); 10070 assert(ST->isUnindexed() && "Stores should be unindexed at this point."); 10071 10072 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && 10073 !Subtarget->isThumb1Only() && ST->isVolatile()) { 10074 SDNode *N = Op.getNode(); 10075 SDLoc dl(N); 10076 10077 SDValue Lo = DAG.getNode( 10078 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), 10079 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl, 10080 MVT::i32)); 10081 SDValue Hi = DAG.getNode( 10082 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), 10083 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl, 10084 MVT::i32)); 10085 10086 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), 10087 {ST->getChain(), Lo, Hi, ST->getBasePtr()}, 10088 MemVT, ST->getMemOperand()); 10089 } else if (Subtarget->hasMVEIntegerOps() && 10090 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || 10091 MemVT == MVT::v16i1))) { 10092 return LowerPredicateStore(Op, DAG); 10093 } 10094 10095 return SDValue(); 10096 } 10097 10098 static bool isZeroVector(SDValue N) { 10099 return (ISD::isBuildVectorAllZeros(N.getNode()) || 10100 (N->getOpcode() == ARMISD::VMOVIMM && 10101 isNullConstant(N->getOperand(0)))); 10102 } 10103 10104 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { 10105 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); 10106 MVT VT = Op.getSimpleValueType(); 10107 SDValue Mask = N->getMask(); 10108 SDValue PassThru = N->getPassThru(); 10109 SDLoc dl(Op); 10110 10111 if (isZeroVector(PassThru)) 10112 return Op; 10113 10114 // MVE Masked loads use zero as the passthru value. Here we convert undef to 10115 // zero too, and other values are lowered to a select. 
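// Roughly: masked.load(ptr, mask, passthru) is rewritten as
//   %l = masked.load(ptr, mask, zero)
//   %r = vselect(mask, %l, passthru)  ; the select is emitted only when
//                                     ; passthru is neither undef nor
//                                     ; (a cast of) zero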
  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(0, dl, MVT::i32));
  SDValue NewLoad = DAG.getMaskedLoad(
      VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
      N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
      N->getExtensionType(), N->isExpandingLoad());
  SDValue Combo = NewLoad;
  bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
                             PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
                            isZeroVector(PassThru->getOperand(0));
  if (!PassThru.isUndef() && !PassThruIsCastZero)
    Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
  return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}

static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
                              const ARMSubtarget *ST) {
  if (!ST->hasMVEIntegerOps())
    return SDValue();

  SDLoc dl(Op);
  unsigned BaseOpcode = 0;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Expected VECREDUCE opcode");
  case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
  case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
  case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
  case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
  case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
  case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
  case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
  case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
  }

  SDValue Op0 = Op->getOperand(0);
  EVT VT = Op0.getValueType();
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumActiveLanes = NumElts;

  assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
          NumActiveLanes == 2) &&
         "Only expected a power-of-2 vector size");

  // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
  // allows us to easily extract vector elements from the lanes.
  while (NumActiveLanes > 4) {
    unsigned RevOpcode = NumActiveLanes == 16 ?
ARMISD::VREV16 : ARMISD::VREV32; 10164 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0); 10165 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev); 10166 NumActiveLanes /= 2; 10167 } 10168 10169 SDValue Res; 10170 if (NumActiveLanes == 4) { 10171 // The remaining 4 elements are summed sequentially 10172 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10173 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32)); 10174 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10175 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32)); 10176 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10177 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32)); 10178 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10179 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32)); 10180 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); 10181 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags()); 10182 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags()); 10183 } else { 10184 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10185 DAG.getConstant(0, dl, MVT::i32)); 10186 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 10187 DAG.getConstant(1, dl, MVT::i32)); 10188 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); 10189 } 10190 10191 // Result type may be wider than element type. 10192 if (EltVT != Op->getValueType(0)) 10193 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res); 10194 return Res; 10195 } 10196 10197 static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, 10198 const ARMSubtarget *ST) { 10199 if (!ST->hasMVEFloatOps()) 10200 return SDValue(); 10201 return LowerVecReduce(Op, DAG, ST); 10202 } 10203 10204 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 10205 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering())) 10206 // Acquire/Release load/store is not legal for targets without a dmb or 10207 // equivalent available. 10208 return SDValue(); 10209 10210 // Monotonic load/store is legal for all targets. 
10211 return Op; 10212 } 10213 10214 static void ReplaceREADCYCLECOUNTER(SDNode *N, 10215 SmallVectorImpl<SDValue> &Results, 10216 SelectionDAG &DAG, 10217 const ARMSubtarget *Subtarget) { 10218 SDLoc DL(N); 10219 // Under Power Management extensions, the cycle-count is: 10220 // mrc p15, #0, <Rt>, c9, c13, #0 10221 SDValue Ops[] = { N->getOperand(0), // Chain 10222 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 10223 DAG.getTargetConstant(15, DL, MVT::i32), 10224 DAG.getTargetConstant(0, DL, MVT::i32), 10225 DAG.getTargetConstant(9, DL, MVT::i32), 10226 DAG.getTargetConstant(13, DL, MVT::i32), 10227 DAG.getTargetConstant(0, DL, MVT::i32) 10228 }; 10229 10230 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 10231 DAG.getVTList(MVT::i32, MVT::Other), Ops); 10232 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 10233 DAG.getConstant(0, DL, MVT::i32))); 10234 Results.push_back(Cycles32.getValue(1)); 10235 } 10236 10237 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 10238 SDLoc dl(V.getNode()); 10239 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 10240 SDValue VHi = DAG.getAnyExtOrTrunc( 10241 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 10242 dl, MVT::i32); 10243 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 10244 if (isBigEndian) 10245 std::swap (VLo, VHi); 10246 SDValue RegClass = 10247 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 10248 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 10249 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 10250 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 10251 return SDValue( 10252 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 10253 } 10254 10255 static void ReplaceCMP_SWAP_64Results(SDNode *N, 10256 SmallVectorImpl<SDValue> & Results, 10257 SelectionDAG &DAG) { 10258 assert(N->getValueType(0) == MVT::i64 && 10259 "AtomicCmpSwap on types less than 64 should be legal"); 10260 SDValue Ops[] = {N->getOperand(1), 10261 createGPRPairNode(DAG, N->getOperand(2)), 10262 createGPRPairNode(DAG, N->getOperand(3)), 10263 N->getOperand(0)}; 10264 SDNode *CmpSwap = DAG.getMachineNode( 10265 ARM::CMP_SWAP_64, SDLoc(N), 10266 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 10267 10268 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 10269 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 10270 10271 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 10272 10273 SDValue Lo = 10274 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, 10275 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); 10276 SDValue Hi = 10277 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, 10278 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); 10279 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi)); 10280 Results.push_back(SDValue(CmpSwap, 2)); 10281 } 10282 10283 SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const { 10284 SDLoc dl(Op); 10285 EVT VT = Op.getValueType(); 10286 SDValue Chain = Op.getOperand(0); 10287 SDValue LHS = Op.getOperand(1); 10288 SDValue RHS = Op.getOperand(2); 10289 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get(); 10290 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; 10291 10292 // If we don't have instructions of this float type then soften to a libcall 10293 // and use SETCC instead. 
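  // e.g. an f64 compare on a soft-float configuration becomes a call to the
  // corresponding RTLIB comparison routine (the __aeabi_dcmp*/__aeabi_fcmp*
  // helpers on AEABI targets), and its integer result is then tested with a
  // plain SETCC.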
10294 if (isUnsupportedFloatingType(LHS.getValueType())) { 10295 DAG.getTargetLoweringInfo().softenSetCCOperands( 10296 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling); 10297 if (!RHS.getNode()) { 10298 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 10299 CC = ISD::SETNE; 10300 } 10301 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS, 10302 DAG.getCondCode(CC)); 10303 return DAG.getMergeValues({Result, Chain}, dl); 10304 } 10305 10306 ARMCC::CondCodes CondCode, CondCode2; 10307 FPCCToARMCC(CC, CondCode, CondCode2); 10308 10309 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit 10310 // in CMPFP and CMPFPE, but instead it should be made explicit by these 10311 // instructions using a chain instead of glue. This would also fix the problem 10312 // here (and also in LowerSELECT_CC) where we generate two comparisons when 10313 // CondCode2 != AL. 10314 SDValue True = DAG.getConstant(1, dl, VT); 10315 SDValue False = DAG.getConstant(0, dl, VT); 10316 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 10317 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 10318 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 10319 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG); 10320 if (CondCode2 != ARMCC::AL) { 10321 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 10322 Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 10323 Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG); 10324 } 10325 return DAG.getMergeValues({Result, Chain}, dl); 10326 } 10327 10328 SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const { 10329 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 10330 10331 EVT VT = getPointerTy(DAG.getDataLayout()); 10332 SDLoc DL(Op); 10333 int FI = MFI.CreateFixedObject(4, 0, false); 10334 return DAG.getFrameIndex(FI, VT); 10335 } 10336 10337 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 10338 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); 10339 switch (Op.getOpcode()) { 10340 default: llvm_unreachable("Don't know how to custom lower this!"); 10341 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 10342 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 10343 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 10344 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 10345 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 10346 case ISD::SELECT: return LowerSELECT(Op, DAG); 10347 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 10348 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 10349 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 10350 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 10351 case ISD::VASTART: return LowerVASTART(Op, DAG); 10352 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 10353 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 10354 case ISD::SINT_TO_FP: 10355 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 10356 case ISD::STRICT_FP_TO_SINT: 10357 case ISD::STRICT_FP_TO_UINT: 10358 case ISD::FP_TO_SINT: 10359 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 10360 case ISD::FP_TO_SINT_SAT: 10361 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget); 10362 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 10363 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 10364 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 10365 case ISD::EH_SJLJ_SETJMP: return 
LowerEH_SJLJ_SETJMP(Op, DAG); 10366 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 10367 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 10368 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); 10369 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 10370 Subtarget); 10371 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); 10372 case ISD::SHL: 10373 case ISD::SRL: 10374 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 10375 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 10376 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 10377 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 10378 case ISD::SRL_PARTS: 10379 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 10380 case ISD::CTTZ: 10381 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 10382 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 10383 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); 10384 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); 10385 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 10386 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 10387 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); 10388 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); 10389 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 10390 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); 10391 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); 10392 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget); 10393 case ISD::SIGN_EXTEND: 10394 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget); 10395 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 10396 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG); 10397 case ISD::MUL: return LowerMUL(Op, DAG); 10398 case ISD::SDIV: 10399 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 10400 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 10401 return LowerSDIV(Op, DAG, Subtarget); 10402 case ISD::UDIV: 10403 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 10404 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 10405 return LowerUDIV(Op, DAG, Subtarget); 10406 case ISD::ADDCARRY: 10407 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); 10408 case ISD::SADDO: 10409 case ISD::SSUBO: 10410 return LowerSignedALUO(Op, DAG); 10411 case ISD::UADDO: 10412 case ISD::USUBO: 10413 return LowerUnsignedALUO(Op, DAG); 10414 case ISD::SADDSAT: 10415 case ISD::SSUBSAT: 10416 case ISD::UADDSAT: 10417 case ISD::USUBSAT: 10418 return LowerADDSUBSAT(Op, DAG, Subtarget); 10419 case ISD::LOAD: 10420 return LowerPredicateLoad(Op, DAG); 10421 case ISD::STORE: 10422 return LowerSTORE(Op, DAG, Subtarget); 10423 case ISD::MLOAD: 10424 return LowerMLOAD(Op, DAG); 10425 case ISD::VECREDUCE_MUL: 10426 case ISD::VECREDUCE_AND: 10427 case ISD::VECREDUCE_OR: 10428 case ISD::VECREDUCE_XOR: 10429 return LowerVecReduce(Op, DAG, Subtarget); 10430 case ISD::VECREDUCE_FADD: 10431 case ISD::VECREDUCE_FMUL: 10432 case ISD::VECREDUCE_FMIN: 10433 case ISD::VECREDUCE_FMAX: 10434 return LowerVecReduceF(Op, DAG, Subtarget); 10435 case ISD::ATOMIC_LOAD: 10436 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 10437 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 10438 case 
ISD::SDIVREM: 10439 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 10440 case ISD::DYNAMIC_STACKALLOC: 10441 if (Subtarget->isTargetWindows()) 10442 return LowerDYNAMIC_STACKALLOC(Op, DAG); 10443 llvm_unreachable("Don't know how to custom lower this!"); 10444 case ISD::STRICT_FP_ROUND: 10445 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 10446 case ISD::STRICT_FP_EXTEND: 10447 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 10448 case ISD::STRICT_FSETCC: 10449 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG); 10450 case ISD::SPONENTRY: 10451 return LowerSPONENTRY(Op, DAG); 10452 case ARMISD::WIN__DBZCHK: return SDValue(); 10453 } 10454 } 10455 10456 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, 10457 SelectionDAG &DAG) { 10458 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 10459 unsigned Opc = 0; 10460 if (IntNo == Intrinsic::arm_smlald) 10461 Opc = ARMISD::SMLALD; 10462 else if (IntNo == Intrinsic::arm_smlaldx) 10463 Opc = ARMISD::SMLALDX; 10464 else if (IntNo == Intrinsic::arm_smlsld) 10465 Opc = ARMISD::SMLSLD; 10466 else if (IntNo == Intrinsic::arm_smlsldx) 10467 Opc = ARMISD::SMLSLDX; 10468 else 10469 return; 10470 10471 SDLoc dl(N); 10472 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10473 N->getOperand(3), 10474 DAG.getConstant(0, dl, MVT::i32)); 10475 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10476 N->getOperand(3), 10477 DAG.getConstant(1, dl, MVT::i32)); 10478 10479 SDValue LongMul = DAG.getNode(Opc, dl, 10480 DAG.getVTList(MVT::i32, MVT::i32), 10481 N->getOperand(1), N->getOperand(2), 10482 Lo, Hi); 10483 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, 10484 LongMul.getValue(0), LongMul.getValue(1))); 10485 } 10486 10487 /// ReplaceNodeResults - Replace the results of node with an illegal result 10488 /// type with new values built out of custom code. 
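/// On ARM the illegal result type is most often i64, which the helpers above
/// rebuild as a pair of i32 values combined with ISD::BUILD_PAIR.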
10489 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 10490 SmallVectorImpl<SDValue> &Results, 10491 SelectionDAG &DAG) const { 10492 SDValue Res; 10493 switch (N->getOpcode()) { 10494 default: 10495 llvm_unreachable("Don't know how to custom expand this!"); 10496 case ISD::READ_REGISTER: 10497 ExpandREAD_REGISTER(N, Results, DAG); 10498 break; 10499 case ISD::BITCAST: 10500 Res = ExpandBITCAST(N, DAG, Subtarget); 10501 break; 10502 case ISD::SRL: 10503 case ISD::SRA: 10504 case ISD::SHL: 10505 Res = Expand64BitShift(N, DAG, Subtarget); 10506 break; 10507 case ISD::SREM: 10508 case ISD::UREM: 10509 Res = LowerREM(N, DAG); 10510 break; 10511 case ISD::SDIVREM: 10512 case ISD::UDIVREM: 10513 Res = LowerDivRem(SDValue(N, 0), DAG); 10514 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 10515 Results.push_back(Res.getValue(0)); 10516 Results.push_back(Res.getValue(1)); 10517 return; 10518 case ISD::SADDSAT: 10519 case ISD::SSUBSAT: 10520 case ISD::UADDSAT: 10521 case ISD::USUBSAT: 10522 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget); 10523 break; 10524 case ISD::READCYCLECOUNTER: 10525 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 10526 return; 10527 case ISD::UDIV: 10528 case ISD::SDIV: 10529 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 10530 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 10531 Results); 10532 case ISD::ATOMIC_CMP_SWAP: 10533 ReplaceCMP_SWAP_64Results(N, Results, DAG); 10534 return; 10535 case ISD::INTRINSIC_WO_CHAIN: 10536 return ReplaceLongIntrinsic(N, Results, DAG); 10537 case ISD::LOAD: 10538 LowerLOAD(N, Results, DAG); 10539 break; 10540 case ISD::TRUNCATE: 10541 Res = LowerTruncate(N, DAG, Subtarget); 10542 break; 10543 case ISD::SIGN_EXTEND: 10544 case ISD::ZERO_EXTEND: 10545 Res = LowerVectorExtend(N, DAG, Subtarget); 10546 break; 10547 case ISD::FP_TO_SINT_SAT: 10548 case ISD::FP_TO_UINT_SAT: 10549 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget); 10550 break; 10551 } 10552 if (Res.getNode()) 10553 Results.push_back(Res); 10554 } 10555 10556 //===----------------------------------------------------------------------===// 10557 // ARM Scheduler Hooks 10558 //===----------------------------------------------------------------------===// 10559 10560 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 10561 /// registers the function context. 10562 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 10563 MachineBasicBlock *MBB, 10564 MachineBasicBlock *DispatchBB, 10565 int FI) const { 10566 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 10567 "ROPI/RWPI not currently supported with SjLj"); 10568 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10569 DebugLoc dl = MI.getDebugLoc(); 10570 MachineFunction *MF = MBB->getParent(); 10571 MachineRegisterInfo *MRI = &MF->getRegInfo(); 10572 MachineConstantPool *MCP = MF->getConstantPool(); 10573 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 10574 const Function &F = MF->getFunction(); 10575 10576 bool isThumb = Subtarget->isThumb(); 10577 bool isThumb2 = Subtarget->isThumb2(); 10578 10579 unsigned PCLabelId = AFI->createPICLabelUId(); 10580 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 10581 ARMConstantPoolValue *CPV = 10582 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); 10583 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4)); 10584 10585 const TargetRegisterClass *TRC = isThumb ? 
&ARM::tGPRRegClass 10586 : &ARM::GPRRegClass; 10587 10588 // Grab constant pool and fixed stack memory operands. 10589 MachineMemOperand *CPMMO = 10590 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 10591 MachineMemOperand::MOLoad, 4, Align(4)); 10592 10593 MachineMemOperand *FIMMOSt = 10594 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 10595 MachineMemOperand::MOStore, 4, Align(4)); 10596 10597 // Load the address of the dispatch MBB into the jump buffer. 10598 if (isThumb2) { 10599 // Incoming value: jbuf 10600 // ldr.n r5, LCPI1_1 10601 // orr r5, r5, #1 10602 // add r5, pc 10603 // str r5, [$jbuf, #+4] ; &jbuf[1] 10604 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10605 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 10606 .addConstantPoolIndex(CPI) 10607 .addMemOperand(CPMMO) 10608 .add(predOps(ARMCC::AL)); 10609 // Set the low bit because of thumb mode. 10610 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10611 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 10612 .addReg(NewVReg1, RegState::Kill) 10613 .addImm(0x01) 10614 .add(predOps(ARMCC::AL)) 10615 .add(condCodeOp()); 10616 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10617 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 10618 .addReg(NewVReg2, RegState::Kill) 10619 .addImm(PCLabelId); 10620 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 10621 .addReg(NewVReg3, RegState::Kill) 10622 .addFrameIndex(FI) 10623 .addImm(36) // &jbuf[1] :: pc 10624 .addMemOperand(FIMMOSt) 10625 .add(predOps(ARMCC::AL)); 10626 } else if (isThumb) { 10627 // Incoming value: jbuf 10628 // ldr.n r1, LCPI1_4 10629 // add r1, pc 10630 // mov r2, #1 10631 // orrs r1, r2 10632 // add r2, $jbuf, #+4 ; &jbuf[1] 10633 // str r1, [r2] 10634 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10635 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 10636 .addConstantPoolIndex(CPI) 10637 .addMemOperand(CPMMO) 10638 .add(predOps(ARMCC::AL)); 10639 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10640 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 10641 .addReg(NewVReg1, RegState::Kill) 10642 .addImm(PCLabelId); 10643 // Set the low bit because of thumb mode. 
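    // (Bit 0 of a code address marks it as Thumb code; keeping it set means a
    // later BX/BLX through the jump buffer stays in Thumb state.)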
10644 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10645 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 10646 .addReg(ARM::CPSR, RegState::Define) 10647 .addImm(1) 10648 .add(predOps(ARMCC::AL)); 10649 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10650 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 10651 .addReg(ARM::CPSR, RegState::Define) 10652 .addReg(NewVReg2, RegState::Kill) 10653 .addReg(NewVReg3, RegState::Kill) 10654 .add(predOps(ARMCC::AL)); 10655 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10656 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 10657 .addFrameIndex(FI) 10658 .addImm(36); // &jbuf[1] :: pc 10659 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 10660 .addReg(NewVReg4, RegState::Kill) 10661 .addReg(NewVReg5, RegState::Kill) 10662 .addImm(0) 10663 .addMemOperand(FIMMOSt) 10664 .add(predOps(ARMCC::AL)); 10665 } else { 10666 // Incoming value: jbuf 10667 // ldr r1, LCPI1_1 10668 // add r1, pc, r1 10669 // str r1, [$jbuf, #+4] ; &jbuf[1] 10670 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10671 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 10672 .addConstantPoolIndex(CPI) 10673 .addImm(0) 10674 .addMemOperand(CPMMO) 10675 .add(predOps(ARMCC::AL)); 10676 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10677 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 10678 .addReg(NewVReg1, RegState::Kill) 10679 .addImm(PCLabelId) 10680 .add(predOps(ARMCC::AL)); 10681 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 10682 .addReg(NewVReg2, RegState::Kill) 10683 .addFrameIndex(FI) 10684 .addImm(36) // &jbuf[1] :: pc 10685 .addMemOperand(FIMMOSt) 10686 .add(predOps(ARMCC::AL)); 10687 } 10688 } 10689 10690 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 10691 MachineBasicBlock *MBB) const { 10692 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10693 DebugLoc dl = MI.getDebugLoc(); 10694 MachineFunction *MF = MBB->getParent(); 10695 MachineRegisterInfo *MRI = &MF->getRegInfo(); 10696 MachineFrameInfo &MFI = MF->getFrameInfo(); 10697 int FI = MFI.getFunctionContextIndex(); 10698 10699 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 10700 : &ARM::GPRnopcRegClass; 10701 10702 // Get a mapping of the call site numbers to all of the landing pads they're 10703 // associated with. 10704 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; 10705 unsigned MaxCSNum = 0; 10706 for (MachineBasicBlock &BB : *MF) { 10707 if (!BB.isEHPad()) 10708 continue; 10709 10710 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 10711 // pad. 10712 for (MachineInstr &II : BB) { 10713 if (!II.isEHLabel()) 10714 continue; 10715 10716 MCSymbol *Sym = II.getOperand(0).getMCSymbol(); 10717 if (!MF->hasCallSiteLandingPad(Sym)) continue; 10718 10719 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 10720 for (unsigned Idx : CallSiteIdxs) { 10721 CallSiteNumToLPad[Idx].push_back(&BB); 10722 MaxCSNum = std::max(MaxCSNum, Idx); 10723 } 10724 break; 10725 } 10726 } 10727 10728 // Get an ordered list of the machine basic blocks for the jump table. 
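  // (CallSiteNumToLPad is keyed by the call site number attached to each EH
  // label; the loop below flattens it in call-site order, and a single call
  // site may map to more than one landing pad.)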
  std::vector<MachineBasicBlock*> LPadList;
  SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());
  for (unsigned I = 1; I <= MaxCSNum; ++I) {
    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
    for (MachineBasicBlock *MBB : MBBList) {
      LPadList.push_back(MBB);
      InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the jump table and associated information.
  MachineJumpTableInfo *JTI =
      MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad();

  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  unsigned trap_opcode;
  if (Subtarget->isThumb())
    trap_opcode = ARM::tTRAP;
  else
    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;

  BuildMI(TrapBB, dl, TII->get(trap_opcode));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert the MBBs.
  MF->insert(MF->end(), DispatchBB);
  MF->insert(MF->end(), DispContBB);
  MF->insert(MF->end(), TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);

  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI),
      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));

  MachineInstrBuilder MIB;
  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));

  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();

  // Add a register mask with no preserved registers. This results in all
  // registers being marked as clobbered. This can't work if the dispatch block
  // is in a Thumb1 function and is linked with ARM code which uses the FP
  // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10790 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 10791 10792 bool IsPositionIndependent = isPositionIndependent(); 10793 unsigned NumLPads = LPadList.size(); 10794 if (Subtarget->isThumb2()) { 10795 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10796 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 10797 .addFrameIndex(FI) 10798 .addImm(4) 10799 .addMemOperand(FIMMOLd) 10800 .add(predOps(ARMCC::AL)); 10801 10802 if (NumLPads < 256) { 10803 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 10804 .addReg(NewVReg1) 10805 .addImm(LPadList.size()) 10806 .add(predOps(ARMCC::AL)); 10807 } else { 10808 Register VReg1 = MRI->createVirtualRegister(TRC); 10809 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 10810 .addImm(NumLPads & 0xFFFF) 10811 .add(predOps(ARMCC::AL)); 10812 10813 unsigned VReg2 = VReg1; 10814 if ((NumLPads & 0xFFFF0000) != 0) { 10815 VReg2 = MRI->createVirtualRegister(TRC); 10816 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 10817 .addReg(VReg1) 10818 .addImm(NumLPads >> 16) 10819 .add(predOps(ARMCC::AL)); 10820 } 10821 10822 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 10823 .addReg(NewVReg1) 10824 .addReg(VReg2) 10825 .add(predOps(ARMCC::AL)); 10826 } 10827 10828 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 10829 .addMBB(TrapBB) 10830 .addImm(ARMCC::HI) 10831 .addReg(ARM::CPSR); 10832 10833 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10834 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 10835 .addJumpTableIndex(MJTI) 10836 .add(predOps(ARMCC::AL)); 10837 10838 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10839 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 10840 .addReg(NewVReg3, RegState::Kill) 10841 .addReg(NewVReg1) 10842 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 10843 .add(predOps(ARMCC::AL)) 10844 .add(condCodeOp()); 10845 10846 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 10847 .addReg(NewVReg4, RegState::Kill) 10848 .addReg(NewVReg1) 10849 .addJumpTableIndex(MJTI); 10850 } else if (Subtarget->isThumb()) { 10851 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10852 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 10853 .addFrameIndex(FI) 10854 .addImm(1) 10855 .addMemOperand(FIMMOLd) 10856 .add(predOps(ARMCC::AL)); 10857 10858 if (NumLPads < 256) { 10859 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 10860 .addReg(NewVReg1) 10861 .addImm(NumLPads) 10862 .add(predOps(ARMCC::AL)); 10863 } else { 10864 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10865 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10866 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 10867 10868 // MachineConstantPool wants an explicit alignment. 
10869 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 10870 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 10871 10872 Register VReg1 = MRI->createVirtualRegister(TRC); 10873 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 10874 .addReg(VReg1, RegState::Define) 10875 .addConstantPoolIndex(Idx) 10876 .add(predOps(ARMCC::AL)); 10877 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 10878 .addReg(NewVReg1) 10879 .addReg(VReg1) 10880 .add(predOps(ARMCC::AL)); 10881 } 10882 10883 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 10884 .addMBB(TrapBB) 10885 .addImm(ARMCC::HI) 10886 .addReg(ARM::CPSR); 10887 10888 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10889 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 10890 .addReg(ARM::CPSR, RegState::Define) 10891 .addReg(NewVReg1) 10892 .addImm(2) 10893 .add(predOps(ARMCC::AL)); 10894 10895 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10896 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 10897 .addJumpTableIndex(MJTI) 10898 .add(predOps(ARMCC::AL)); 10899 10900 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10901 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 10902 .addReg(ARM::CPSR, RegState::Define) 10903 .addReg(NewVReg2, RegState::Kill) 10904 .addReg(NewVReg3) 10905 .add(predOps(ARMCC::AL)); 10906 10907 MachineMemOperand *JTMMOLd = 10908 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), 10909 MachineMemOperand::MOLoad, 4, Align(4)); 10910 10911 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10912 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 10913 .addReg(NewVReg4, RegState::Kill) 10914 .addImm(0) 10915 .addMemOperand(JTMMOLd) 10916 .add(predOps(ARMCC::AL)); 10917 10918 unsigned NewVReg6 = NewVReg5; 10919 if (IsPositionIndependent) { 10920 NewVReg6 = MRI->createVirtualRegister(TRC); 10921 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 10922 .addReg(ARM::CPSR, RegState::Define) 10923 .addReg(NewVReg5, RegState::Kill) 10924 .addReg(NewVReg3) 10925 .add(predOps(ARMCC::AL)); 10926 } 10927 10928 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 10929 .addReg(NewVReg6, RegState::Kill) 10930 .addJumpTableIndex(MJTI); 10931 } else { 10932 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10933 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 10934 .addFrameIndex(FI) 10935 .addImm(4) 10936 .addMemOperand(FIMMOLd) 10937 .add(predOps(ARMCC::AL)); 10938 10939 if (NumLPads < 256) { 10940 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 10941 .addReg(NewVReg1) 10942 .addImm(NumLPads) 10943 .add(predOps(ARMCC::AL)); 10944 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 10945 Register VReg1 = MRI->createVirtualRegister(TRC); 10946 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 10947 .addImm(NumLPads & 0xFFFF) 10948 .add(predOps(ARMCC::AL)); 10949 10950 unsigned VReg2 = VReg1; 10951 if ((NumLPads & 0xFFFF0000) != 0) { 10952 VReg2 = MRI->createVirtualRegister(TRC); 10953 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 10954 .addReg(VReg1) 10955 .addImm(NumLPads >> 16) 10956 .add(predOps(ARMCC::AL)); 10957 } 10958 10959 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 10960 .addReg(NewVReg1) 10961 .addReg(VReg2) 10962 .add(predOps(ARMCC::AL)); 10963 } else { 10964 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10965 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10966 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 10967 10968 // MachineConstantPool wants an explicit 
alignment. 10969 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 10970 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 10971 10972 Register VReg1 = MRI->createVirtualRegister(TRC); 10973 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 10974 .addReg(VReg1, RegState::Define) 10975 .addConstantPoolIndex(Idx) 10976 .addImm(0) 10977 .add(predOps(ARMCC::AL)); 10978 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 10979 .addReg(NewVReg1) 10980 .addReg(VReg1, RegState::Kill) 10981 .add(predOps(ARMCC::AL)); 10982 } 10983 10984 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 10985 .addMBB(TrapBB) 10986 .addImm(ARMCC::HI) 10987 .addReg(ARM::CPSR); 10988 10989 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10990 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 10991 .addReg(NewVReg1) 10992 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 10993 .add(predOps(ARMCC::AL)) 10994 .add(condCodeOp()); 10995 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10996 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 10997 .addJumpTableIndex(MJTI) 10998 .add(predOps(ARMCC::AL)); 10999 11000 MachineMemOperand *JTMMOLd = 11001 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), 11002 MachineMemOperand::MOLoad, 4, Align(4)); 11003 Register NewVReg5 = MRI->createVirtualRegister(TRC); 11004 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 11005 .addReg(NewVReg3, RegState::Kill) 11006 .addReg(NewVReg4) 11007 .addImm(0) 11008 .addMemOperand(JTMMOLd) 11009 .add(predOps(ARMCC::AL)); 11010 11011 if (IsPositionIndependent) { 11012 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 11013 .addReg(NewVReg5, RegState::Kill) 11014 .addReg(NewVReg4) 11015 .addJumpTableIndex(MJTI); 11016 } else { 11017 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 11018 .addReg(NewVReg5, RegState::Kill) 11019 .addJumpTableIndex(MJTI); 11020 } 11021 } 11022 11023 // Add the jump table entries as successors to the MBB. 11024 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 11025 for (MachineBasicBlock *CurMBB : LPadList) { 11026 if (SeenMBBs.insert(CurMBB).second) 11027 DispContBB->addSuccessor(CurMBB); 11028 } 11029 11030 // N.B. the order the invoke BBs are processed in doesn't matter here. 11031 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 11032 SmallVector<MachineBasicBlock*, 64> MBBLPads; 11033 for (MachineBasicBlock *BB : InvokeBBs) { 11034 11035 // Remove the landing pad successor from the invoke block and replace it 11036 // with the new dispatch block. 11037 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors()); 11038 while (!Successors.empty()) { 11039 MachineBasicBlock *SMBB = Successors.pop_back_val(); 11040 if (SMBB->isEHPad()) { 11041 BB->removeSuccessor(SMBB); 11042 MBBLPads.push_back(SMBB); 11043 } 11044 } 11045 11046 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 11047 BB->normalizeSuccProbs(); 11048 11049 // Find the invoke call and mark all of the callee-saved registers as 11050 // 'implicit defined' so that they're spilled. This prevents code from 11051 // moving instructions to before the EH block, where they will never be 11052 // executed. 
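    // Concretely: walk the invoke block backwards to the call, collect the
    // registers it already defines, and tag every remaining callee-saved GPR
    // (filtered by the current ISA below) as an implicit dead def on the call.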
11053 for (MachineBasicBlock::reverse_iterator 11054 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 11055 if (!II->isCall()) continue; 11056 11057 DenseMap<unsigned, bool> DefRegs; 11058 for (MachineInstr::mop_iterator 11059 OI = II->operands_begin(), OE = II->operands_end(); 11060 OI != OE; ++OI) { 11061 if (!OI->isReg()) continue; 11062 DefRegs[OI->getReg()] = true; 11063 } 11064 11065 MachineInstrBuilder MIB(*MF, &*II); 11066 11067 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 11068 unsigned Reg = SavedRegs[i]; 11069 if (Subtarget->isThumb2() && 11070 !ARM::tGPRRegClass.contains(Reg) && 11071 !ARM::hGPRRegClass.contains(Reg)) 11072 continue; 11073 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 11074 continue; 11075 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 11076 continue; 11077 if (!DefRegs[Reg]) 11078 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 11079 } 11080 11081 break; 11082 } 11083 } 11084 11085 // Mark all former landing pads as non-landing pads. The dispatch is the only 11086 // landing pad now. 11087 for (MachineBasicBlock *MBBLPad : MBBLPads) 11088 MBBLPad->setIsEHPad(false); 11089 11090 // The instruction is gone now. 11091 MI.eraseFromParent(); 11092 } 11093 11094 static 11095 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 11096 for (MachineBasicBlock *S : MBB->successors()) 11097 if (S != Succ) 11098 return S; 11099 llvm_unreachable("Expecting a BB with two successors!"); 11100 } 11101 11102 /// Return the load opcode for a given load size. If load size >= 8, 11103 /// neon opcode will be returned. 11104 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 11105 if (LdSize >= 8) 11106 return LdSize == 16 ? ARM::VLD1q32wb_fixed 11107 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 11108 if (IsThumb1) 11109 return LdSize == 4 ? ARM::tLDRi 11110 : LdSize == 2 ? ARM::tLDRHi 11111 : LdSize == 1 ? ARM::tLDRBi : 0; 11112 if (IsThumb2) 11113 return LdSize == 4 ? ARM::t2LDR_POST 11114 : LdSize == 2 ? ARM::t2LDRH_POST 11115 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 11116 return LdSize == 4 ? ARM::LDR_POST_IMM 11117 : LdSize == 2 ? ARM::LDRH_POST 11118 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 11119 } 11120 11121 /// Return the store opcode for a given store size. If store size >= 8, 11122 /// neon opcode will be returned. 11123 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 11124 if (StSize >= 8) 11125 return StSize == 16 ? ARM::VST1q32wb_fixed 11126 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 11127 if (IsThumb1) 11128 return StSize == 4 ? ARM::tSTRi 11129 : StSize == 2 ? ARM::tSTRHi 11130 : StSize == 1 ? ARM::tSTRBi : 0; 11131 if (IsThumb2) 11132 return StSize == 4 ? ARM::t2STR_POST 11133 : StSize == 2 ? ARM::t2STRH_POST 11134 : StSize == 1 ? ARM::t2STRB_POST : 0; 11135 return StSize == 4 ? ARM::STR_POST_IMM 11136 : StSize == 2 ? ARM::STRH_POST 11137 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 11138 } 11139 11140 /// Emit a post-increment load operation with given size. The instructions 11141 /// will be added to BB at Pos. 
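/// For example, with LdSize == 4 in ARM mode this produces a post-indexed
/// "ldr Data, [AddrIn], #4" (LDR_POST_IMM) whose writeback defines AddrOut.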
11142 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 11143 const TargetInstrInfo *TII, const DebugLoc &dl, 11144 unsigned LdSize, unsigned Data, unsigned AddrIn, 11145 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 11146 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 11147 assert(LdOpc != 0 && "Should have a load opcode"); 11148 if (LdSize >= 8) { 11149 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 11150 .addReg(AddrOut, RegState::Define) 11151 .addReg(AddrIn) 11152 .addImm(0) 11153 .add(predOps(ARMCC::AL)); 11154 } else if (IsThumb1) { 11155 // load + update AddrIn 11156 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 11157 .addReg(AddrIn) 11158 .addImm(0) 11159 .add(predOps(ARMCC::AL)); 11160 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 11161 .add(t1CondCodeOp()) 11162 .addReg(AddrIn) 11163 .addImm(LdSize) 11164 .add(predOps(ARMCC::AL)); 11165 } else if (IsThumb2) { 11166 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 11167 .addReg(AddrOut, RegState::Define) 11168 .addReg(AddrIn) 11169 .addImm(LdSize) 11170 .add(predOps(ARMCC::AL)); 11171 } else { // arm 11172 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 11173 .addReg(AddrOut, RegState::Define) 11174 .addReg(AddrIn) 11175 .addReg(0) 11176 .addImm(LdSize) 11177 .add(predOps(ARMCC::AL)); 11178 } 11179 } 11180 11181 /// Emit a post-increment store operation with given size. The instructions 11182 /// will be added to BB at Pos. 11183 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 11184 const TargetInstrInfo *TII, const DebugLoc &dl, 11185 unsigned StSize, unsigned Data, unsigned AddrIn, 11186 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 11187 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 11188 assert(StOpc != 0 && "Should have a store opcode"); 11189 if (StSize >= 8) { 11190 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 11191 .addReg(AddrIn) 11192 .addImm(0) 11193 .addReg(Data) 11194 .add(predOps(ARMCC::AL)); 11195 } else if (IsThumb1) { 11196 // store + update AddrIn 11197 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 11198 .addReg(Data) 11199 .addReg(AddrIn) 11200 .addImm(0) 11201 .add(predOps(ARMCC::AL)); 11202 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 11203 .add(t1CondCodeOp()) 11204 .addReg(AddrIn) 11205 .addImm(StSize) 11206 .add(predOps(ARMCC::AL)); 11207 } else if (IsThumb2) { 11208 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 11209 .addReg(Data) 11210 .addReg(AddrIn) 11211 .addImm(StSize) 11212 .add(predOps(ARMCC::AL)); 11213 } else { // arm 11214 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 11215 .addReg(Data) 11216 .addReg(AddrIn) 11217 .addReg(0) 11218 .addImm(StSize) 11219 .add(predOps(ARMCC::AL)); 11220 } 11221 } 11222 11223 MachineBasicBlock * 11224 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 11225 MachineBasicBlock *BB) const { 11226 // This pseudo instruction has 3 operands: dst, src, size 11227 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 11228 // Otherwise, we will generate unrolled scalar copies. 
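  // (A fourth operand carries the alignment; it is read below to pick the copy
  // unit: 1, 2, 4, or, when NEON is usable, 8 or 16 bytes at a time.)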
11229 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11230 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11231 MachineFunction::iterator It = ++BB->getIterator(); 11232 11233 Register dest = MI.getOperand(0).getReg(); 11234 Register src = MI.getOperand(1).getReg(); 11235 unsigned SizeVal = MI.getOperand(2).getImm(); 11236 unsigned Alignment = MI.getOperand(3).getImm(); 11237 DebugLoc dl = MI.getDebugLoc(); 11238 11239 MachineFunction *MF = BB->getParent(); 11240 MachineRegisterInfo &MRI = MF->getRegInfo(); 11241 unsigned UnitSize = 0; 11242 const TargetRegisterClass *TRC = nullptr; 11243 const TargetRegisterClass *VecTRC = nullptr; 11244 11245 bool IsThumb1 = Subtarget->isThumb1Only(); 11246 bool IsThumb2 = Subtarget->isThumb2(); 11247 bool IsThumb = Subtarget->isThumb(); 11248 11249 if (Alignment & 1) { 11250 UnitSize = 1; 11251 } else if (Alignment & 2) { 11252 UnitSize = 2; 11253 } else { 11254 // Check whether we can use NEON instructions. 11255 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && 11256 Subtarget->hasNEON()) { 11257 if ((Alignment % 16 == 0) && SizeVal >= 16) 11258 UnitSize = 16; 11259 else if ((Alignment % 8 == 0) && SizeVal >= 8) 11260 UnitSize = 8; 11261 } 11262 // Can't use NEON instructions. 11263 if (UnitSize == 0) 11264 UnitSize = 4; 11265 } 11266 11267 // Select the correct opcode and register class for unit size load/store 11268 bool IsNeon = UnitSize >= 8; 11269 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 11270 if (IsNeon) 11271 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 11272 : UnitSize == 8 ? &ARM::DPRRegClass 11273 : nullptr; 11274 11275 unsigned BytesLeft = SizeVal % UnitSize; 11276 unsigned LoopSize = SizeVal - BytesLeft; 11277 11278 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 11279 // Use LDR and STR to copy. 11280 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 11281 // [destOut] = STR_POST(scratch, destIn, UnitSize) 11282 unsigned srcIn = src; 11283 unsigned destIn = dest; 11284 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 11285 Register srcOut = MRI.createVirtualRegister(TRC); 11286 Register destOut = MRI.createVirtualRegister(TRC); 11287 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 11288 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 11289 IsThumb1, IsThumb2); 11290 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 11291 IsThumb1, IsThumb2); 11292 srcIn = srcOut; 11293 destIn = destOut; 11294 } 11295 11296 // Handle the leftover bytes with LDRB and STRB. 11297 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 11298 // [destOut] = STRB_POST(scratch, destIn, 1) 11299 for (unsigned i = 0; i < BytesLeft; i++) { 11300 Register srcOut = MRI.createVirtualRegister(TRC); 11301 Register destOut = MRI.createVirtualRegister(TRC); 11302 Register scratch = MRI.createVirtualRegister(TRC); 11303 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 11304 IsThumb1, IsThumb2); 11305 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 11306 IsThumb1, IsThumb2); 11307 srcIn = srcOut; 11308 destIn = destOut; 11309 } 11310 MI.eraseFromParent(); // The instruction is gone now. 11311 return BB; 11312 } 11313 11314 // Expand the pseudo op to a loop. 11315 // thisMBB: 11316 // ... 
11317 // movw varEnd, # --> with thumb2 11318 // movt varEnd, # 11319 // ldrcp varEnd, idx --> without thumb2 11320 // fallthrough --> loopMBB 11321 // loopMBB: 11322 // PHI varPhi, varEnd, varLoop 11323 // PHI srcPhi, src, srcLoop 11324 // PHI destPhi, dst, destLoop 11325 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 11326 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 11327 // subs varLoop, varPhi, #UnitSize 11328 // bne loopMBB 11329 // fallthrough --> exitMBB 11330 // exitMBB: 11331 // epilogue to handle left-over bytes 11332 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 11333 // [destOut] = STRB_POST(scratch, destLoop, 1) 11334 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11335 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11336 MF->insert(It, loopMBB); 11337 MF->insert(It, exitMBB); 11338 11339 // Transfer the remainder of BB and its successor edges to exitMBB. 11340 exitMBB->splice(exitMBB->begin(), BB, 11341 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11342 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 11343 11344 // Load an immediate to varEnd. 11345 Register varEnd = MRI.createVirtualRegister(TRC); 11346 if (Subtarget->useMovt()) { 11347 unsigned Vtmp = varEnd; 11348 if ((LoopSize & 0xFFFF0000) != 0) 11349 Vtmp = MRI.createVirtualRegister(TRC); 11350 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 11351 .addImm(LoopSize & 0xFFFF) 11352 .add(predOps(ARMCC::AL)); 11353 11354 if ((LoopSize & 0xFFFF0000) != 0) 11355 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 11356 .addReg(Vtmp) 11357 .addImm(LoopSize >> 16) 11358 .add(predOps(ARMCC::AL)); 11359 } else { 11360 MachineConstantPool *ConstantPool = MF->getConstantPool(); 11361 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 11362 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 11363 11364 // MachineConstantPool wants an explicit alignment. 
    Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
    MachineMemOperand *CPMMO =
        MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                                 MachineMemOperand::MOLoad, 4, Align(4));

    if (IsThumb)
      BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL))
          .addMemOperand(CPMMO);
    else
      BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL))
          .addMemOperand(CPMMO);
  }
  BB->addSuccessor(loopMBB);

  // Generate the loop body:
  //   varPhi = PHI(varLoop, varEnd)
  //   srcPhi = PHI(srcLoop, src)
  //   destPhi = PHI(destLoop, dst)
  MachineBasicBlock *entryBB = BB;
  BB = loopMBB;
  Register varLoop = MRI.createVirtualRegister(TRC);
  Register varPhi = MRI.createVirtualRegister(TRC);
  Register srcLoop = MRI.createVirtualRegister(TRC);
  Register srcPhi = MRI.createVirtualRegister(TRC);
  Register destLoop = MRI.createVirtualRegister(TRC);
  Register destPhi = MRI.createVirtualRegister(TRC);

  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
      .addReg(varLoop).addMBB(loopMBB)
      .addReg(varEnd).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
      .addReg(srcLoop).addMBB(loopMBB)
      .addReg(src).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
      .addReg(destLoop).addMBB(loopMBB)
      .addReg(dest).addMBB(entryBB);

  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
             IsThumb1, IsThumb2);
  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
             IsThumb1, IsThumb2);

  // Decrement loop variable by UnitSize.
  if (IsThumb1) {
    BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
        .add(t1CondCodeOp())
        .addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL));
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, BB->end(), dl,
                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
    MIB.addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    MIB->getOperand(5).setReg(ARM::CPSR);
    MIB->getOperand(5).setIsDef(true);
  }
  BuildMI(*BB, BB->end(), dl,
          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  // loopMBB can loop back to loopMBB or fall through to exitMBB.
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // Add epilogue to handle BytesLeft.
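  // (BytesLeft = SizeVal % UnitSize; each leftover byte is copied below with
  // byte-sized post-increment loads and stores.)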
  BB = exitMBB;
  auto StartOfExit = exitMBB->begin();

  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  unsigned srcIn = srcLoop;
  unsigned destIn = destLoop;
  for (unsigned i = 0; i < BytesLeft; i++) {
    Register srcOut = MRI.createVirtualRegister(TRC);
    Register destOut = MRI.createVirtualRegister(TRC);
    Register scratch = MRI.createVirtualRegister(TRC);
    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
               IsThumb1, IsThumb2);
    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
               IsThumb1, IsThumb2);
    srcIn = srcOut;
    destIn = destOut;
  }

  MI.eraseFromParent();   // The instruction is gone now.
  return BB;
}

MachineBasicBlock *
ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  const TargetMachine &TM = getTargetMachine();
  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget->isTargetWindows() &&
         "__chkstk is only supported on Windows");
  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

  // __chkstk takes the number of words to allocate on the stack in R4, and
  // returns the stack adjustment in number of bytes in R4. This will not
  // clobber any other registers (other than the obvious lr).
  //
  // Although, technically, IP should be considered a register which may be
  // clobbered, the call itself will not touch it. Windows on ARM is a pure
  // thumb-2 environment, so there is no interworking required. As a result, we
  // do not expect a veneer to be emitted by the linker, clobbering IP.
  //
  // Each module receives its own copy of __chkstk, so no import thunk is
  // required, again, ensuring that IP is not clobbered.
  //
  // Finally, although some linkers may theoretically provide a trampoline for
  // out of range calls (which is quite common due to a 32M range limitation of
  // branches for Thumb), we can generate the long-call version via
  // -mcmodel=large, alleviating the need for the trampoline which may clobber
  // IP.
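  //
  // The emitted sequence is therefore either a direct "bl __chkstk", or, for
  // the large code model, a t2MOVi32imm of the symbol followed by an indirect
  // blx; in both cases it is followed by "sub.w sp, sp, r4" to apply the
  // adjustment returned in R4.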
11496 11497 switch (TM.getCodeModel()) { 11498 case CodeModel::Tiny: 11499 llvm_unreachable("Tiny code model not available on ARM."); 11500 case CodeModel::Small: 11501 case CodeModel::Medium: 11502 case CodeModel::Kernel: 11503 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 11504 .add(predOps(ARMCC::AL)) 11505 .addExternalSymbol("__chkstk") 11506 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 11507 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 11508 .addReg(ARM::R12, 11509 RegState::Implicit | RegState::Define | RegState::Dead) 11510 .addReg(ARM::CPSR, 11511 RegState::Implicit | RegState::Define | RegState::Dead); 11512 break; 11513 case CodeModel::Large: { 11514 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 11515 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11516 11517 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 11518 .addExternalSymbol("__chkstk"); 11519 BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent()))) 11520 .add(predOps(ARMCC::AL)) 11521 .addReg(Reg, RegState::Kill) 11522 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 11523 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 11524 .addReg(ARM::R12, 11525 RegState::Implicit | RegState::Define | RegState::Dead) 11526 .addReg(ARM::CPSR, 11527 RegState::Implicit | RegState::Define | RegState::Dead); 11528 break; 11529 } 11530 } 11531 11532 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 11533 .addReg(ARM::SP, RegState::Kill) 11534 .addReg(ARM::R4, RegState::Kill) 11535 .setMIFlags(MachineInstr::FrameSetup) 11536 .add(predOps(ARMCC::AL)) 11537 .add(condCodeOp()); 11538 11539 MI.eraseFromParent(); 11540 return MBB; 11541 } 11542 11543 MachineBasicBlock * 11544 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 11545 MachineBasicBlock *MBB) const { 11546 DebugLoc DL = MI.getDebugLoc(); 11547 MachineFunction *MF = MBB->getParent(); 11548 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11549 11550 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 11551 MF->insert(++MBB->getIterator(), ContBB); 11552 ContBB->splice(ContBB->begin(), MBB, 11553 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 11554 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 11555 MBB->addSuccessor(ContBB); 11556 11557 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 11558 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 11559 MF->push_back(TrapBB); 11560 MBB->addSuccessor(TrapBB); 11561 11562 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 11563 .addReg(MI.getOperand(0).getReg()) 11564 .addImm(0) 11565 .add(predOps(ARMCC::AL)); 11566 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 11567 .addMBB(TrapBB) 11568 .addImm(ARMCC::EQ) 11569 .addReg(ARM::CPSR); 11570 11571 MI.eraseFromParent(); 11572 return ContBB; 11573 } 11574 11575 // The CPSR operand of SelectItr might be missing a kill marker 11576 // because there were multiple uses of CPSR, and ISel didn't know 11577 // which to mark. Figure out whether SelectItr should have had a 11578 // kill marker, and set it if it should. Returns the correct kill 11579 // marker value. 11580 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, 11581 MachineBasicBlock* BB, 11582 const TargetRegisterInfo* TRI) { 11583 // Scan forward through BB for a use/def of CPSR. 
11584 MachineBasicBlock::iterator miI(std::next(SelectItr)); 11585 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 11586 const MachineInstr& mi = *miI; 11587 if (mi.readsRegister(ARM::CPSR)) 11588 return false; 11589 if (mi.definesRegister(ARM::CPSR)) 11590 break; // Should have kill-flag - update below. 11591 } 11592 11593 // If we hit the end of the block, check whether CPSR is live into a 11594 // successor. 11595 if (miI == BB->end()) { 11596 for (MachineBasicBlock *Succ : BB->successors()) 11597 if (Succ->isLiveIn(ARM::CPSR)) 11598 return false; 11599 } 11600 11601 // We found a def, or hit the end of the basic block and CPSR wasn't live 11602 // out. SelectMI should have a kill flag on CPSR. 11603 SelectItr->addRegisterKilled(ARM::CPSR, TRI); 11604 return true; 11605 } 11606 11607 /// Adds logic in loop entry MBB to calculate loop iteration count and adds 11608 /// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop 11609 static Register genTPEntry(MachineBasicBlock *TpEntry, 11610 MachineBasicBlock *TpLoopBody, 11611 MachineBasicBlock *TpExit, Register OpSizeReg, 11612 const TargetInstrInfo *TII, DebugLoc Dl, 11613 MachineRegisterInfo &MRI) { 11614 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4. 11615 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11616 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg) 11617 .addUse(OpSizeReg) 11618 .addImm(15) 11619 .add(predOps(ARMCC::AL)) 11620 .addReg(0); 11621 11622 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11623 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg) 11624 .addUse(AddDestReg, RegState::Kill) 11625 .addImm(4) 11626 .add(predOps(ARMCC::AL)) 11627 .addReg(0); 11628 11629 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11630 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg) 11631 .addUse(LsrDestReg, RegState::Kill); 11632 11633 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart)) 11634 .addUse(TotalIterationsReg) 11635 .addMBB(TpExit); 11636 11637 BuildMI(TpEntry, Dl, TII->get(ARM::t2B)) 11638 .addMBB(TpLoopBody) 11639 .add(predOps(ARMCC::AL)); 11640 11641 return TotalIterationsReg; 11642 } 11643 11644 /// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and 11645 /// t2DoLoopEnd. These are used by later passes to generate tail predicated 11646 /// loops. 11647 static void genTPLoopBody(MachineBasicBlock *TpLoopBody, 11648 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, 11649 const TargetInstrInfo *TII, DebugLoc Dl, 11650 MachineRegisterInfo &MRI, Register OpSrcReg, 11651 Register OpDestReg, Register ElementCountReg, 11652 Register TotalIterationsReg, bool IsMemcpy) { 11653 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest 11654 // array, loop iteration counter, predication counter. 
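  // Illustrative shape of the loop body built below (virtual register names
  // are made up):
  //   %src.phi  = PHI [ %src,   TpEntry ], [ %src.next,  TpLoopBody ]  ; memcpy only
  //   %dst.phi  = PHI [ %dst,   TpEntry ], [ %dst.next,  TpLoopBody ]
  //   %iter.phi = PHI [ %iters, TpEntry ], [ %iter.next, TpLoopBody ]
  //   %elem.phi = PHI [ %n,     TpEntry ], [ %elem.next, TpLoopBody ]
  //   %vpr      = MVE_VCTP8 %elem.phi     ; predicate covering the valid lanes
  //   %elem.next = %elem.phi - 16
  // followed by the predicated VLDRB (memcpy only) / VSTRB, t2LoopDec and
  // t2LoopEnd.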
11655 11656 Register SrcPhiReg, CurrSrcReg; 11657 if (IsMemcpy) { 11658 // Current position in the src array 11659 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11660 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11661 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg) 11662 .addUse(OpSrcReg) 11663 .addMBB(TpEntry) 11664 .addUse(CurrSrcReg) 11665 .addMBB(TpLoopBody); 11666 } 11667 11668 // Current position in the dest array 11669 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11670 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11671 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg) 11672 .addUse(OpDestReg) 11673 .addMBB(TpEntry) 11674 .addUse(CurrDestReg) 11675 .addMBB(TpLoopBody); 11676 11677 // Current loop counter 11678 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11679 Register RemainingLoopIterationsReg = 11680 MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11681 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg) 11682 .addUse(TotalIterationsReg) 11683 .addMBB(TpEntry) 11684 .addUse(RemainingLoopIterationsReg) 11685 .addMBB(TpLoopBody); 11686 11687 // Predication counter 11688 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11689 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11690 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg) 11691 .addUse(ElementCountReg) 11692 .addMBB(TpEntry) 11693 .addUse(RemainingElementsReg) 11694 .addMBB(TpLoopBody); 11695 11696 // Pass predication counter to VCTP 11697 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass); 11698 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg) 11699 .addUse(PredCounterPhiReg) 11700 .addImm(ARMVCC::None) 11701 .addReg(0) 11702 .addReg(0); 11703 11704 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg) 11705 .addUse(PredCounterPhiReg) 11706 .addImm(16) 11707 .add(predOps(ARMCC::AL)) 11708 .addReg(0); 11709 11710 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR 11711 Register SrcValueReg; 11712 if (IsMemcpy) { 11713 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass); 11714 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post)) 11715 .addDef(CurrSrcReg) 11716 .addDef(SrcValueReg) 11717 .addReg(SrcPhiReg) 11718 .addImm(16) 11719 .addImm(ARMVCC::Then) 11720 .addUse(VccrReg) 11721 .addReg(0); 11722 } else 11723 SrcValueReg = OpSrcReg; 11724 11725 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post)) 11726 .addDef(CurrDestReg) 11727 .addUse(SrcValueReg) 11728 .addReg(DestPhiReg) 11729 .addImm(16) 11730 .addImm(ARMVCC::Then) 11731 .addUse(VccrReg) 11732 .addReg(0); 11733 11734 // Add the pseudoInstrs for decrementing the loop counter and marking the 11735 // end:t2DoLoopDec and t2DoLoopEnd 11736 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg) 11737 .addUse(LoopCounterPhiReg) 11738 .addImm(1); 11739 11740 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd)) 11741 .addUse(RemainingLoopIterationsReg) 11742 .addMBB(TpLoopBody); 11743 11744 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B)) 11745 .addMBB(TpExit) 11746 .add(predOps(ARMCC::AL)); 11747 } 11748 11749 MachineBasicBlock * 11750 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 11751 MachineBasicBlock *BB) const { 11752 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11753 DebugLoc dl = MI.getDebugLoc(); 11754 bool isThumb2 = Subtarget->isThumb2(); 11755 switch 
(MI.getOpcode()) { 11756 default: { 11757 MI.print(errs()); 11758 llvm_unreachable("Unexpected instr type to insert"); 11759 } 11760 11761 // Thumb1 post-indexed loads are really just single-register LDMs. 11762 case ARM::tLDR_postidx: { 11763 MachineOperand Def(MI.getOperand(1)); 11764 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) 11765 .add(Def) // Rn_wb 11766 .add(MI.getOperand(2)) // Rn 11767 .add(MI.getOperand(3)) // PredImm 11768 .add(MI.getOperand(4)) // PredReg 11769 .add(MI.getOperand(0)) // Rt 11770 .cloneMemRefs(MI); 11771 MI.eraseFromParent(); 11772 return BB; 11773 } 11774 11775 case ARM::MVE_MEMCPYLOOPINST: 11776 case ARM::MVE_MEMSETLOOPINST: { 11777 11778 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo 11779 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate 11780 // the iteration count =ceil(size_in_bytes/16)) in the TP entry block and 11781 // adds the relevant instructions in the TP loop Body for generation of a 11782 // WLSTP loop. 11783 11784 // Below is relevant portion of the CFG after the transformation. 11785 // The Machine Basic Blocks are shown along with branch conditions (in 11786 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this 11787 // portion of the CFG and may not necessarily be the entry/exit of the 11788 // function. 11789 11790 // (Relevant) CFG after transformation: 11791 // TP entry MBB 11792 // | 11793 // |-----------------| 11794 // (n <= 0) (n > 0) 11795 // | | 11796 // | TP loop Body MBB<--| 11797 // | | | 11798 // \ |___________| 11799 // \ / 11800 // TP exit MBB 11801 11802 MachineFunction *MF = BB->getParent(); 11803 MachineFunctionProperties &Properties = MF->getProperties(); 11804 MachineRegisterInfo &MRI = MF->getRegInfo(); 11805 11806 Register OpDestReg = MI.getOperand(0).getReg(); 11807 Register OpSrcReg = MI.getOperand(1).getReg(); 11808 Register OpSizeReg = MI.getOperand(2).getReg(); 11809 11810 // Allocate the required MBBs and add to parent function. 11811 MachineBasicBlock *TpEntry = BB; 11812 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock(); 11813 MachineBasicBlock *TpExit; 11814 11815 MF->push_back(TpLoopBody); 11816 11817 // If any instructions are present in the current block after 11818 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and 11819 // move the instructions into the newly created exit block. If there are no 11820 // instructions add an explicit branch to the FallThrough block and then 11821 // split. 11822 // 11823 // The split is required for two reasons: 11824 // 1) A terminator(t2WhileLoopStart) will be placed at that site. 11825 // 2) Since a TPLoopBody will be added later, any phis in successive blocks 11826 // need to be updated. splitAt() already handles this. 
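    // For illustration, once the low-overhead-loop passes have run, the code
    // built here is expected to end up looking roughly like this (Thumb-2/MVE
    // assembly, labels invented):
    //   wlstp.8  lr, rCount, .Lexit   ; skip the loop when the count is zero
    // .Lbody:
    //   vldrb.u8 q0, [rSrc], #16      ; memcpy only
    //   vstrb.u8 q0, [rDst], #16
    //   letp     lr, .Lbody
    // .Lexit: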
11827 TpExit = BB->splitAt(MI, false); 11828 if (TpExit == BB) { 11829 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the " 11830 "block containing memcpy/memset Pseudo"); 11831 TpExit = BB->getFallThrough(); 11832 BuildMI(BB, dl, TII->get(ARM::t2B)) 11833 .addMBB(TpExit) 11834 .add(predOps(ARMCC::AL)); 11835 TpExit = BB->splitAt(MI, false); 11836 } 11837 11838 // Add logic for iteration count 11839 Register TotalIterationsReg = 11840 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI); 11841 11842 // Add the vectorized (and predicated) loads/store instructions 11843 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST; 11844 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg, 11845 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy); 11846 11847 // Required to avoid conflict with the MachineVerifier during testing. 11848 Properties.reset(MachineFunctionProperties::Property::NoPHIs); 11849 11850 // Connect the blocks 11851 TpEntry->addSuccessor(TpLoopBody); 11852 TpLoopBody->addSuccessor(TpLoopBody); 11853 TpLoopBody->addSuccessor(TpExit); 11854 11855 // Reorder for a more natural layout 11856 TpLoopBody->moveAfter(TpEntry); 11857 TpExit->moveAfter(TpLoopBody); 11858 11859 // Finally, remove the memcpy Psuedo Instruction 11860 MI.eraseFromParent(); 11861 11862 // Return the exit block as it may contain other instructions requiring a 11863 // custom inserter 11864 return TpExit; 11865 } 11866 11867 // The Thumb2 pre-indexed stores have the same MI operands, they just 11868 // define them differently in the .td files from the isel patterns, so 11869 // they need pseudos. 11870 case ARM::t2STR_preidx: 11871 MI.setDesc(TII->get(ARM::t2STR_PRE)); 11872 return BB; 11873 case ARM::t2STRB_preidx: 11874 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 11875 return BB; 11876 case ARM::t2STRH_preidx: 11877 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 11878 return BB; 11879 11880 case ARM::STRi_preidx: 11881 case ARM::STRBi_preidx: { 11882 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM 11883 : ARM::STRB_PRE_IMM; 11884 // Decode the offset. 11885 unsigned Offset = MI.getOperand(4).getImm(); 11886 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 11887 Offset = ARM_AM::getAM2Offset(Offset); 11888 if (isSub) 11889 Offset = -Offset; 11890 11891 MachineMemOperand *MMO = *MI.memoperands_begin(); 11892 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 11893 .add(MI.getOperand(0)) // Rn_wb 11894 .add(MI.getOperand(1)) // Rt 11895 .add(MI.getOperand(2)) // Rn 11896 .addImm(Offset) // offset (skip GPR==zero_reg) 11897 .add(MI.getOperand(5)) // pred 11898 .add(MI.getOperand(6)) 11899 .addMemOperand(MMO); 11900 MI.eraseFromParent(); 11901 return BB; 11902 } 11903 case ARM::STRr_preidx: 11904 case ARM::STRBr_preidx: 11905 case ARM::STRH_preidx: { 11906 unsigned NewOpc; 11907 switch (MI.getOpcode()) { 11908 default: llvm_unreachable("unexpected opcode!"); 11909 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 11910 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 11911 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 11912 } 11913 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 11914 for (const MachineOperand &MO : MI.operands()) 11915 MIB.add(MO); 11916 MI.eraseFromParent(); 11917 return BB; 11918 } 11919 11920 case ARM::tMOVCCr_pseudo: { 11921 // To "insert" a SELECT_CC instruction, we actually have to insert the 11922 // diamond control-flow pattern. 
The incoming instruction knows the 11923 // destination vreg to set, the condition code register to branch on, the 11924 // true/false values to select between, and a branch opcode to use. 11925 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11926 MachineFunction::iterator It = ++BB->getIterator(); 11927 11928 // thisMBB: 11929 // ... 11930 // TrueVal = ... 11931 // cmpTY ccX, r1, r2 11932 // bCC copy1MBB 11933 // fallthrough --> copy0MBB 11934 MachineBasicBlock *thisMBB = BB; 11935 MachineFunction *F = BB->getParent(); 11936 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 11937 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 11938 F->insert(It, copy0MBB); 11939 F->insert(It, sinkMBB); 11940 11941 // Check whether CPSR is live past the tMOVCCr_pseudo. 11942 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 11943 if (!MI.killsRegister(ARM::CPSR) && 11944 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 11945 copy0MBB->addLiveIn(ARM::CPSR); 11946 sinkMBB->addLiveIn(ARM::CPSR); 11947 } 11948 11949 // Transfer the remainder of BB and its successor edges to sinkMBB. 11950 sinkMBB->splice(sinkMBB->begin(), BB, 11951 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11952 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 11953 11954 BB->addSuccessor(copy0MBB); 11955 BB->addSuccessor(sinkMBB); 11956 11957 BuildMI(BB, dl, TII->get(ARM::tBcc)) 11958 .addMBB(sinkMBB) 11959 .addImm(MI.getOperand(3).getImm()) 11960 .addReg(MI.getOperand(4).getReg()); 11961 11962 // copy0MBB: 11963 // %FalseValue = ... 11964 // # fallthrough to sinkMBB 11965 BB = copy0MBB; 11966 11967 // Update machine-CFG edges 11968 BB->addSuccessor(sinkMBB); 11969 11970 // sinkMBB: 11971 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 11972 // ... 11973 BB = sinkMBB; 11974 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 11975 .addReg(MI.getOperand(1).getReg()) 11976 .addMBB(copy0MBB) 11977 .addReg(MI.getOperand(2).getReg()) 11978 .addMBB(thisMBB); 11979 11980 MI.eraseFromParent(); // The pseudo instruction is gone now. 11981 return BB; 11982 } 11983 11984 case ARM::BCCi64: 11985 case ARM::BCCZi64: { 11986 // If there is an unconditional branch to the other successor, remove it. 11987 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11988 11989 // Compare both parts that make up the double comparison separately for 11990 // equality. 11991 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 11992 11993 Register LHS1 = MI.getOperand(1).getReg(); 11994 Register LHS2 = MI.getOperand(2).getReg(); 11995 if (RHSisZero) { 11996 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 11997 .addReg(LHS1) 11998 .addImm(0) 11999 .add(predOps(ARMCC::AL)); 12000 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 12001 .addReg(LHS2).addImm(0) 12002 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 12003 } else { 12004 Register RHS1 = MI.getOperand(3).getReg(); 12005 Register RHS2 = MI.getOperand(4).getReg(); 12006 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 12007 .addReg(LHS1) 12008 .addReg(RHS1) 12009 .add(predOps(ARMCC::AL)); 12010 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 12011 .addReg(LHS2).addReg(RHS2) 12012 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 12013 } 12014 12015 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 
3 : 5).getMBB(); 12016 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 12017 if (MI.getOperand(0).getImm() == ARMCC::NE) 12018 std::swap(destMBB, exitMBB); 12019 12020 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 12021 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 12022 if (isThumb2) 12023 BuildMI(BB, dl, TII->get(ARM::t2B)) 12024 .addMBB(exitMBB) 12025 .add(predOps(ARMCC::AL)); 12026 else 12027 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 12028 12029 MI.eraseFromParent(); // The pseudo instruction is gone now. 12030 return BB; 12031 } 12032 12033 case ARM::Int_eh_sjlj_setjmp: 12034 case ARM::Int_eh_sjlj_setjmp_nofp: 12035 case ARM::tInt_eh_sjlj_setjmp: 12036 case ARM::t2Int_eh_sjlj_setjmp: 12037 case ARM::t2Int_eh_sjlj_setjmp_nofp: 12038 return BB; 12039 12040 case ARM::Int_eh_sjlj_setup_dispatch: 12041 EmitSjLjDispatchBlock(MI, BB); 12042 return BB; 12043 12044 case ARM::ABS: 12045 case ARM::t2ABS: { 12046 // To insert an ABS instruction, we have to insert the 12047 // diamond control-flow pattern. The incoming instruction knows the 12048 // source vreg to test against 0, the destination vreg to set, 12049 // the condition code register to branch on, the 12050 // true/false values to select between, and a branch opcode to use. 12051 // It transforms 12052 // V1 = ABS V0 12053 // into 12054 // V2 = MOVS V0 12055 // BCC (branch to SinkBB if V0 >= 0) 12056 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 12057 // SinkBB: V1 = PHI(V2, V3) 12058 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 12059 MachineFunction::iterator BBI = ++BB->getIterator(); 12060 MachineFunction *Fn = BB->getParent(); 12061 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 12062 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 12063 Fn->insert(BBI, RSBBB); 12064 Fn->insert(BBI, SinkBB); 12065 12066 Register ABSSrcReg = MI.getOperand(1).getReg(); 12067 Register ABSDstReg = MI.getOperand(0).getReg(); 12068 bool ABSSrcKIll = MI.getOperand(1).isKill(); 12069 bool isThumb2 = Subtarget->isThumb2(); 12070 MachineRegisterInfo &MRI = Fn->getRegInfo(); 12071 // In Thumb mode S must not be specified if source register is the SP or 12072 // PC and if destination register is the SP, so restrict register class 12073 Register NewRsbDstReg = MRI.createVirtualRegister( 12074 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 12075 12076 // Transfer the remainder of BB and its successor edges to sinkMBB. 12077 SinkBB->splice(SinkBB->begin(), BB, 12078 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 12079 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 12080 12081 BB->addSuccessor(RSBBB); 12082 BB->addSuccessor(SinkBB); 12083 12084 // fall through to SinkMBB 12085 RSBBB->addSuccessor(SinkBB); 12086 12087 // insert a cmp at the end of BB 12088 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 12089 .addReg(ABSSrcReg) 12090 .addImm(0) 12091 .add(predOps(ARMCC::AL)); 12092 12093 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 12094 BuildMI(BB, dl, 12095 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 12096 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 12097 12098 // insert rsbri in RSBBB 12099 // Note: BCC and rsbri will be converted into predicated rsbmi 12100 // by if-conversion pass 12101 BuildMI(*RSBBB, RSBBB->begin(), dl, 12102 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 12103 .addReg(ABSSrcReg, ABSSrcKIll ? 
RegState::Kill : 0) 12104 .addImm(0) 12105 .add(predOps(ARMCC::AL)) 12106 .add(condCodeOp()); 12107 12108 // insert PHI in SinkBB, 12109 // reuse ABSDstReg to not change uses of ABS instruction 12110 BuildMI(*SinkBB, SinkBB->begin(), dl, 12111 TII->get(ARM::PHI), ABSDstReg) 12112 .addReg(NewRsbDstReg).addMBB(RSBBB) 12113 .addReg(ABSSrcReg).addMBB(BB); 12114 12115 // remove ABS instruction 12116 MI.eraseFromParent(); 12117 12118 // return last added BB 12119 return SinkBB; 12120 } 12121 case ARM::COPY_STRUCT_BYVAL_I32: 12122 ++NumLoopByVals; 12123 return EmitStructByval(MI, BB); 12124 case ARM::WIN__CHKSTK: 12125 return EmitLowered__chkstk(MI, BB); 12126 case ARM::WIN__DBZCHK: 12127 return EmitLowered__dbzchk(MI, BB); 12128 } 12129 } 12130 12131 /// Attaches vregs to MEMCPY that it will use as scratch registers 12132 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 12133 /// instead of as a custom inserter because we need the use list from the SDNode. 12134 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 12135 MachineInstr &MI, const SDNode *Node) { 12136 bool isThumb1 = Subtarget->isThumb1Only(); 12137 12138 DebugLoc DL = MI.getDebugLoc(); 12139 MachineFunction *MF = MI.getParent()->getParent(); 12140 MachineRegisterInfo &MRI = MF->getRegInfo(); 12141 MachineInstrBuilder MIB(*MF, MI); 12142 12143 // If the new dst/src is unused mark it as dead. 12144 if (!Node->hasAnyUseOfValue(0)) { 12145 MI.getOperand(0).setIsDead(true); 12146 } 12147 if (!Node->hasAnyUseOfValue(1)) { 12148 MI.getOperand(1).setIsDead(true); 12149 } 12150 12151 // The MEMCPY both defines and kills the scratch registers. 12152 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 12153 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 12154 : &ARM::GPRRegClass); 12155 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 12156 } 12157 } 12158 12159 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 12160 SDNode *Node) const { 12161 if (MI.getOpcode() == ARM::MEMCPY) { 12162 attachMEMCPYScratchRegs(Subtarget, MI, Node); 12163 return; 12164 } 12165 12166 const MCInstrDesc *MCID = &MI.getDesc(); 12167 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 12168 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 12169 // operand is still set to noreg. If needed, set the optional operand's 12170 // register to CPSR, and remove the redundant implicit def. 12171 // 12172 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 12173 12174 // Rename pseudo opcodes. 
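  // Illustrative Thumb1 case (operand positions only; the concrete opcodes
  // depend on the pseudo being rewritten): a flag-setting pseudo laid out as
  //   (dst, lhs, rhs, ..., implicit-def CPSR)
  // is rewritten below into the real instruction's layout
  //   (dst, cc_out, lhs, rhs, ..., pred_imm, pred_reg)
  // i.e. cc_out at index 1 and the predicate operands appended at the end.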
12175 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 12176 unsigned ccOutIdx; 12177 if (NewOpc) { 12178 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 12179 MCID = &TII->get(NewOpc); 12180 12181 assert(MCID->getNumOperands() == 12182 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 12183 && "converted opcode should be the same except for cc_out" 12184 " (and, on Thumb1, pred)"); 12185 12186 MI.setDesc(*MCID); 12187 12188 // Add the optional cc_out operand 12189 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 12190 12191 // On Thumb1, move all input operands to the end, then add the predicate 12192 if (Subtarget->isThumb1Only()) { 12193 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 12194 MI.addOperand(MI.getOperand(1)); 12195 MI.removeOperand(1); 12196 } 12197 12198 // Restore the ties 12199 for (unsigned i = MI.getNumOperands(); i--;) { 12200 const MachineOperand& op = MI.getOperand(i); 12201 if (op.isReg() && op.isUse()) { 12202 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 12203 if (DefIdx != -1) 12204 MI.tieOperands(DefIdx, i); 12205 } 12206 } 12207 12208 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 12209 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 12210 ccOutIdx = 1; 12211 } else 12212 ccOutIdx = MCID->getNumOperands() - 1; 12213 } else 12214 ccOutIdx = MCID->getNumOperands() - 1; 12215 12216 // Any ARM instruction that sets the 's' bit should specify an optional 12217 // "cc_out" operand in the last operand position. 12218 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 12219 assert(!NewOpc && "Optional cc_out operand required"); 12220 return; 12221 } 12222 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 12223 // since we already have an optional CPSR def. 12224 bool definesCPSR = false; 12225 bool deadCPSR = false; 12226 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 12227 ++i) { 12228 const MachineOperand &MO = MI.getOperand(i); 12229 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 12230 definesCPSR = true; 12231 if (MO.isDead()) 12232 deadCPSR = true; 12233 MI.removeOperand(i); 12234 break; 12235 } 12236 } 12237 if (!definesCPSR) { 12238 assert(!NewOpc && "Optional cc_out operand required"); 12239 return; 12240 } 12241 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 12242 if (deadCPSR) { 12243 assert(!MI.getOperand(ccOutIdx).getReg() && 12244 "expect uninitialized optional cc_out operand"); 12245 // Thumb1 instructions must have the S bit even if the CPSR is dead. 12246 if (!Subtarget->isThumb1Only()) 12247 return; 12248 } 12249 12250 // If this instruction was defined with an optional CPSR def and its dag node 12251 // had a live implicit CPSR def, then activate the optional CPSR def. 12252 MachineOperand &MO = MI.getOperand(ccOutIdx); 12253 MO.setReg(ARM::CPSR); 12254 MO.setIsDef(true); 12255 } 12256 12257 //===----------------------------------------------------------------------===// 12258 // ARM Optimization Hooks 12259 //===----------------------------------------------------------------------===// 12260 12261 // Helper function that checks if N is a null or all ones constant. 12262 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 12263 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 12264 } 12265 12266 // Return true if N is conditionally 0 or all ones. 
12267 // Detects these expressions where cc is an i1 value: 12268 // 12269 // (select cc 0, y) [AllOnes=0] 12270 // (select cc y, 0) [AllOnes=0] 12271 // (zext cc) [AllOnes=0] 12272 // (sext cc) [AllOnes=0/1] 12273 // (select cc -1, y) [AllOnes=1] 12274 // (select cc y, -1) [AllOnes=1] 12275 // 12276 // Invert is set when N is the null/all ones constant when CC is false. 12277 // OtherOp is set to the alternative value of N. 12278 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 12279 SDValue &CC, bool &Invert, 12280 SDValue &OtherOp, 12281 SelectionDAG &DAG) { 12282 switch (N->getOpcode()) { 12283 default: return false; 12284 case ISD::SELECT: { 12285 CC = N->getOperand(0); 12286 SDValue N1 = N->getOperand(1); 12287 SDValue N2 = N->getOperand(2); 12288 if (isZeroOrAllOnes(N1, AllOnes)) { 12289 Invert = false; 12290 OtherOp = N2; 12291 return true; 12292 } 12293 if (isZeroOrAllOnes(N2, AllOnes)) { 12294 Invert = true; 12295 OtherOp = N1; 12296 return true; 12297 } 12298 return false; 12299 } 12300 case ISD::ZERO_EXTEND: 12301 // (zext cc) can never be the all ones value. 12302 if (AllOnes) 12303 return false; 12304 LLVM_FALLTHROUGH; 12305 case ISD::SIGN_EXTEND: { 12306 SDLoc dl(N); 12307 EVT VT = N->getValueType(0); 12308 CC = N->getOperand(0); 12309 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 12310 return false; 12311 Invert = !AllOnes; 12312 if (AllOnes) 12313 // When looking for an AllOnes constant, N is an sext, and the 'other' 12314 // value is 0. 12315 OtherOp = DAG.getConstant(0, dl, VT); 12316 else if (N->getOpcode() == ISD::ZERO_EXTEND) 12317 // When looking for a 0 constant, N can be zext or sext. 12318 OtherOp = DAG.getConstant(1, dl, VT); 12319 else 12320 OtherOp = DAG.getAllOnesConstant(dl, VT); 12321 return true; 12322 } 12323 } 12324 } 12325 12326 // Combine a constant select operand into its use: 12327 // 12328 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 12329 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 12330 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 12331 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 12332 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 12333 // 12334 // The transform is rejected if the select doesn't have a constant operand that 12335 // is null, or all ones when AllOnes is set. 12336 // 12337 // Also recognize sext/zext from i1: 12338 // 12339 // (add (zext cc), x) -> (select cc (add x, 1), x) 12340 // (add (sext cc), x) -> (select cc (add x, -1), x) 12341 // 12342 // These transformations eventually create predicated instructions. 12343 // 12344 // @param N The node to transform. 12345 // @param Slct The N operand that is a select. 12346 // @param OtherOp The other N operand (x above). 12347 // @param DCI Context. 12348 // @param AllOnes Require the select constant to be all ones instead of null. 12349 // @returns The new node, or SDValue() on failure. 12350 static 12351 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 12352 TargetLowering::DAGCombinerInfo &DCI, 12353 bool AllOnes = false) { 12354 SelectionDAG &DAG = DCI.DAG; 12355 EVT VT = N->getValueType(0); 12356 SDValue NonConstantVal; 12357 SDValue CCOp; 12358 bool SwapSelectOps; 12359 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 12360 NonConstantVal, DAG)) 12361 return SDValue(); 12362 12363 // Slct is now know to be the desired identity constant when CC is true. 
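  // For example (values are arbitrary):
  //   (add (select %cc, 0, 7), %x) -> (select %cc, %x, (add %x, 7))
  // When %cc picks the additive identity the add is only needed on the other
  // arm, and the select can later become a predicated add.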
12364 SDValue TrueVal = OtherOp; 12365 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 12366 OtherOp, NonConstantVal); 12367 // Unless SwapSelectOps says CC should be false. 12368 if (SwapSelectOps) 12369 std::swap(TrueVal, FalseVal); 12370 12371 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 12372 CCOp, TrueVal, FalseVal); 12373 } 12374 12375 // Attempt combineSelectAndUse on each operand of a commutative operator N. 12376 static 12377 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 12378 TargetLowering::DAGCombinerInfo &DCI) { 12379 SDValue N0 = N->getOperand(0); 12380 SDValue N1 = N->getOperand(1); 12381 if (N0.getNode()->hasOneUse()) 12382 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 12383 return Result; 12384 if (N1.getNode()->hasOneUse()) 12385 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 12386 return Result; 12387 return SDValue(); 12388 } 12389 12390 static bool IsVUZPShuffleNode(SDNode *N) { 12391 // VUZP shuffle node. 12392 if (N->getOpcode() == ARMISD::VUZP) 12393 return true; 12394 12395 // "VUZP" on i32 is an alias for VTRN. 12396 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) 12397 return true; 12398 12399 return false; 12400 } 12401 12402 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, 12403 TargetLowering::DAGCombinerInfo &DCI, 12404 const ARMSubtarget *Subtarget) { 12405 // Look for ADD(VUZP.0, VUZP.1). 12406 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || 12407 N0 == N1) 12408 return SDValue(); 12409 12410 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. 12411 if (!N->getValueType(0).is64BitVector()) 12412 return SDValue(); 12413 12414 // Generate vpadd. 12415 SelectionDAG &DAG = DCI.DAG; 12416 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12417 SDLoc dl(N); 12418 SDNode *Unzip = N0.getNode(); 12419 EVT VT = N->getValueType(0); 12420 12421 SmallVector<SDValue, 8> Ops; 12422 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, 12423 TLI.getPointerTy(DAG.getDataLayout()))); 12424 Ops.push_back(Unzip->getOperand(0)); 12425 Ops.push_back(Unzip->getOperand(1)); 12426 12427 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 12428 } 12429 12430 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, 12431 TargetLowering::DAGCombinerInfo &DCI, 12432 const ARMSubtarget *Subtarget) { 12433 // Check for two extended operands. 12434 if (!(N0.getOpcode() == ISD::SIGN_EXTEND && 12435 N1.getOpcode() == ISD::SIGN_EXTEND) && 12436 !(N0.getOpcode() == ISD::ZERO_EXTEND && 12437 N1.getOpcode() == ISD::ZERO_EXTEND)) 12438 return SDValue(); 12439 12440 SDValue N00 = N0.getOperand(0); 12441 SDValue N10 = N1.getOperand(0); 12442 12443 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) 12444 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || 12445 N00 == N10) 12446 return SDValue(); 12447 12448 // We only recognize Q register paddl here; this can't be reached until 12449 // after type legalization. 12450 if (!N00.getValueType().is64BitVector() || 12451 !N0.getValueType().is128BitVector()) 12452 return SDValue(); 12453 12454 // Generate vpaddl. 12455 SelectionDAG &DAG = DCI.DAG; 12456 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12457 SDLoc dl(N); 12458 EVT VT = N->getValueType(0); 12459 12460 SmallVector<SDValue, 8> Ops; 12461 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. 
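  // Illustrative example (one possible set of types): for
  //   add (sext (vuzp %a, %b).0 to v8i16), (sext (vuzp %a, %b).1 to v8i16)
  // the original inputs are re-concatenated into a v16i8 vector and the whole
  // expression becomes a single vpaddl.s8 of that concatenation.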
12462   unsigned Opcode;
12463   if (N0.getOpcode() == ISD::SIGN_EXTEND)
12464     Opcode = Intrinsic::arm_neon_vpaddls;
12465   else
12466     Opcode = Intrinsic::arm_neon_vpaddlu;
12467   Ops.push_back(DAG.getConstant(Opcode, dl,
12468                                 TLI.getPointerTy(DAG.getDataLayout())));
12469   EVT ElemTy = N00.getValueType().getVectorElementType();
12470   unsigned NumElts = VT.getVectorNumElements();
12471   EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12472   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12473                                N00.getOperand(0), N00.getOperand(1));
12474   Ops.push_back(Concat);
12475 
12476   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12477 }
12478 
12479 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12480 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12481 // much easier to match.
12482 static SDValue
12483 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12484                                TargetLowering::DAGCombinerInfo &DCI,
12485                                const ARMSubtarget *Subtarget) {
12486   // Only perform optimization if after legalize, and if NEON is available. We
12487   // also expect both operands to be BUILD_VECTORs.
12488   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12489       || N0.getOpcode() != ISD::BUILD_VECTOR
12490       || N1.getOpcode() != ISD::BUILD_VECTOR)
12491     return SDValue();
12492 
12493   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12494   EVT VT = N->getValueType(0);
12495   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12496     return SDValue();
12497 
12498   // Check that the vector operands are of the right form.
12499   // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12500   // operands, where N is the size of the formed vector.
12501   // Each EXTRACT_VECTOR should have the same input vector and an odd or even
12502   // index such that we have a pairwise add pattern.
12503 
12504   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12505   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12506     return SDValue();
12507   SDValue Vec = N0->getOperand(0)->getOperand(0);
12508   SDNode *V = Vec.getNode();
12509   unsigned nextIndex = 0;
12510 
12511   // For each operand of the ADD that is a BUILD_VECTOR, check whether each of
12512   // its operands is an EXTRACT_VECTOR with the same input vector and the
12513   // appropriate index.
12514   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12515     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
12516         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12517 
12518       SDValue ExtVec0 = N0->getOperand(i);
12519       SDValue ExtVec1 = N1->getOperand(i);
12520 
12521       // First operand is the vector, verify it's the same.
12522       if (V != ExtVec0->getOperand(0).getNode() ||
12523           V != ExtVec1->getOperand(0).getNode())
12524         return SDValue();
12525 
12526       // Second is the constant, verify it's correct.
12527       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12528       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12529 
12530       // For the constant, we want to see all the even or all the odd.
12531       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12532           || C1->getZExtValue() != nextIndex+1)
12533         return SDValue();
12534 
12535       // Increment index.
12536       nextIndex += 2;
12537     } else
12538       return SDValue();
12539   }
12540 
12541   // Don't generate vpaddl+vmovn; we'll match it to vpadd later.
Also make sure 12542 // we're using the entire input vector, otherwise there's a size/legality 12543 // mismatch somewhere. 12544 if (nextIndex != Vec.getValueType().getVectorNumElements() || 12545 Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) 12546 return SDValue(); 12547 12548 // Create VPADDL node. 12549 SelectionDAG &DAG = DCI.DAG; 12550 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12551 12552 SDLoc dl(N); 12553 12554 // Build operand list. 12555 SmallVector<SDValue, 8> Ops; 12556 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, 12557 TLI.getPointerTy(DAG.getDataLayout()))); 12558 12559 // Input is the vector. 12560 Ops.push_back(Vec); 12561 12562 // Get widened type and narrowed type. 12563 MVT widenType; 12564 unsigned numElem = VT.getVectorNumElements(); 12565 12566 EVT inputLaneType = Vec.getValueType().getVectorElementType(); 12567 switch (inputLaneType.getSimpleVT().SimpleTy) { 12568 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; 12569 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; 12570 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; 12571 default: 12572 llvm_unreachable("Invalid vector element type for padd optimization."); 12573 } 12574 12575 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); 12576 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; 12577 return DAG.getNode(ExtOp, dl, VT, tmp); 12578 } 12579 12580 static SDValue findMUL_LOHI(SDValue V) { 12581 if (V->getOpcode() == ISD::UMUL_LOHI || 12582 V->getOpcode() == ISD::SMUL_LOHI) 12583 return V; 12584 return SDValue(); 12585 } 12586 12587 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, 12588 TargetLowering::DAGCombinerInfo &DCI, 12589 const ARMSubtarget *Subtarget) { 12590 if (!Subtarget->hasBaseDSP()) 12591 return SDValue(); 12592 12593 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and 12594 // accumulates the product into a 64-bit value. 
The 16-bit values will 12595 // be sign extended somehow or SRA'd into 32-bit values 12596 // (addc (adde (mul 16bit, 16bit), lo), hi) 12597 SDValue Mul = AddcNode->getOperand(0); 12598 SDValue Lo = AddcNode->getOperand(1); 12599 if (Mul.getOpcode() != ISD::MUL) { 12600 Lo = AddcNode->getOperand(0); 12601 Mul = AddcNode->getOperand(1); 12602 if (Mul.getOpcode() != ISD::MUL) 12603 return SDValue(); 12604 } 12605 12606 SDValue SRA = AddeNode->getOperand(0); 12607 SDValue Hi = AddeNode->getOperand(1); 12608 if (SRA.getOpcode() != ISD::SRA) { 12609 SRA = AddeNode->getOperand(1); 12610 Hi = AddeNode->getOperand(0); 12611 if (SRA.getOpcode() != ISD::SRA) 12612 return SDValue(); 12613 } 12614 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { 12615 if (Const->getZExtValue() != 31) 12616 return SDValue(); 12617 } else 12618 return SDValue(); 12619 12620 if (SRA.getOperand(0) != Mul) 12621 return SDValue(); 12622 12623 SelectionDAG &DAG = DCI.DAG; 12624 SDLoc dl(AddcNode); 12625 unsigned Opcode = 0; 12626 SDValue Op0; 12627 SDValue Op1; 12628 12629 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { 12630 Opcode = ARMISD::SMLALBB; 12631 Op0 = Mul.getOperand(0); 12632 Op1 = Mul.getOperand(1); 12633 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { 12634 Opcode = ARMISD::SMLALBT; 12635 Op0 = Mul.getOperand(0); 12636 Op1 = Mul.getOperand(1).getOperand(0); 12637 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { 12638 Opcode = ARMISD::SMLALTB; 12639 Op0 = Mul.getOperand(0).getOperand(0); 12640 Op1 = Mul.getOperand(1); 12641 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { 12642 Opcode = ARMISD::SMLALTT; 12643 Op0 = Mul->getOperand(0).getOperand(0); 12644 Op1 = Mul->getOperand(1).getOperand(0); 12645 } 12646 12647 if (!Op0 || !Op1) 12648 return SDValue(); 12649 12650 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 12651 Op0, Op1, Lo, Hi); 12652 // Replace the ADDs' nodes uses by the MLA node's values. 12653 SDValue HiMLALResult(SMLAL.getNode(), 1); 12654 SDValue LoMLALResult(SMLAL.getNode(), 0); 12655 12656 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 12657 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 12658 12659 // Return original node to notify the driver to stop replacing. 12660 SDValue resNode(AddcNode, 0); 12661 return resNode; 12662 } 12663 12664 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, 12665 TargetLowering::DAGCombinerInfo &DCI, 12666 const ARMSubtarget *Subtarget) { 12667 // Look for multiply add opportunities. 12668 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 12669 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 12670 // a glue link from the first add to the second add. 12671 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 12672 // a S/UMLAL instruction. 12673 // UMUL_LOHI 12674 // / :lo \ :hi 12675 // V \ [no multiline comment] 12676 // loAdd -> ADDC | 12677 // \ :carry / 12678 // V V 12679 // ADDE <- hiAdd 12680 // 12681 // In the special case where only the higher part of a signed result is used 12682 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts 12683 // a constant with the exact value of 0x80000000, we recognize we are dealing 12684 // with a "rounded multiply and add" (or subtract) and transform it into 12685 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively. 
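  //
  // Illustrative example of the plain (non-rounding) case handled below, with
  // arbitrary operands:
  //   (ADDC (UMUL_LOHI %a, %b):lo, %lo_acc)
  //   (ADDE (UMUL_LOHI %a, %b):hi, %hi_acc, glue)
  // collapses into a single UMLAL %a, %b, %lo_acc, %hi_acc producing both
  // halves of the 64-bit result.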
12686 12687 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE || 12688 AddeSubeNode->getOpcode() == ARMISD::SUBE) && 12689 "Expect an ADDE or SUBE"); 12690 12691 assert(AddeSubeNode->getNumOperands() == 3 && 12692 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 && 12693 "ADDE node has the wrong inputs"); 12694 12695 // Check that we are chained to the right ADDC or SUBC node. 12696 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode(); 12697 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE && 12698 AddcSubcNode->getOpcode() != ARMISD::ADDC) || 12699 (AddeSubeNode->getOpcode() == ARMISD::SUBE && 12700 AddcSubcNode->getOpcode() != ARMISD::SUBC)) 12701 return SDValue(); 12702 12703 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0); 12704 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1); 12705 12706 // Check if the two operands are from the same mul_lohi node. 12707 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode()) 12708 return SDValue(); 12709 12710 assert(AddcSubcNode->getNumValues() == 2 && 12711 AddcSubcNode->getValueType(0) == MVT::i32 && 12712 "Expect ADDC with two result values. First: i32"); 12713 12714 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it 12715 // maybe a SMLAL which multiplies two 16-bit values. 12716 if (AddeSubeNode->getOpcode() == ARMISD::ADDE && 12717 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI && 12718 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI && 12719 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI && 12720 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI) 12721 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget); 12722 12723 // Check for the triangle shape. 12724 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0); 12725 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1); 12726 12727 // Make sure that the ADDE/SUBE operands are not coming from the same node. 12728 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode()) 12729 return SDValue(); 12730 12731 // Find the MUL_LOHI node walking up ADDE/SUBE's operands. 12732 bool IsLeftOperandMUL = false; 12733 SDValue MULOp = findMUL_LOHI(AddeSubeOp0); 12734 if (MULOp == SDValue()) 12735 MULOp = findMUL_LOHI(AddeSubeOp1); 12736 else 12737 IsLeftOperandMUL = true; 12738 if (MULOp == SDValue()) 12739 return SDValue(); 12740 12741 // Figure out the right opcode. 12742 unsigned Opc = MULOp->getOpcode(); 12743 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 12744 12745 // Figure out the high and low input values to the MLAL node. 12746 SDValue *HiAddSub = nullptr; 12747 SDValue *LoMul = nullptr; 12748 SDValue *LowAddSub = nullptr; 12749 12750 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI. 12751 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1))) 12752 return SDValue(); 12753 12754 if (IsLeftOperandMUL) 12755 HiAddSub = &AddeSubeOp1; 12756 else 12757 HiAddSub = &AddeSubeOp0; 12758 12759 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node 12760 // whose low result is fed to the ADDC/SUBC we are checking. 12761 12762 if (AddcSubcOp0 == MULOp.getValue(0)) { 12763 LoMul = &AddcSubcOp0; 12764 LowAddSub = &AddcSubcOp1; 12765 } 12766 if (AddcSubcOp1 == MULOp.getValue(0)) { 12767 LoMul = &AddcSubcOp1; 12768 LowAddSub = &AddcSubcOp0; 12769 } 12770 12771 if (!LoMul) 12772 return SDValue(); 12773 12774 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC 12775 // the replacement below will create a cycle. 
12776   if (AddcSubcNode == HiAddSub->getNode() ||
12777       AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12778     return SDValue();
12779 
12780   // Create the merged node.
12781   SelectionDAG &DAG = DCI.DAG;
12782 
12783   // Start building operand list.
12784   SmallVector<SDValue, 8> Ops;
12785   Ops.push_back(LoMul->getOperand(0));
12786   Ops.push_back(LoMul->getOperand(1));
12787 
12788   // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12789   // the case, we must be doing signed multiplication and only use the higher
12790   // part of the result of the MLAL; furthermore, the LowAddSub must be a
12791   // constant addition or subtraction with the value 0x80000000.
12792   if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12793       FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12794       LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12795       static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12796           0x80000000) {
12797     Ops.push_back(*HiAddSub);
12798     if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12799       FinalOpc = ARMISD::SMMLSR;
12800     } else {
12801       FinalOpc = ARMISD::SMMLAR;
12802     }
12803     SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12804     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12805 
12806     return SDValue(AddeSubeNode, 0);
12807   } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12808     // SMMLS is generated during instruction selection and the rest of this
12809     // function cannot handle the case where AddcSubcNode is a SUBC.
12810     return SDValue();
12811 
12812   // Finish building the operand list for {U/S}MLAL
12813   Ops.push_back(*LowAddSub);
12814   Ops.push_back(*HiAddSub);
12815 
12816   SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12817                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
12818 
12819   // Replace the ADDs' nodes uses by the MLA node's values.
12820   SDValue HiMLALResult(MLALNode.getNode(), 1);
12821   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12822 
12823   SDValue LoMLALResult(MLALNode.getNode(), 0);
12824   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12825 
12826   // Return original node to notify the driver to stop replacing.
12827   return SDValue(AddeSubeNode, 0);
12828 }
12829 
12830 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
12831                                       TargetLowering::DAGCombinerInfo &DCI,
12832                                       const ARMSubtarget *Subtarget) {
12833   // UMAAL is similar to UMLAL except that it adds two unsigned values.
12834   // While trying to combine for the other MLAL nodes, first search for the
12835   // chance to use UMAAL. Check if Addc uses a node which has already
12836   // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12837   // as the addend, and it's handled in PerformUMLALCombine.
12838 
12839   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12840     return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12841 
12842   // Check that we have a glued ADDC node.
12843   SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12844   if (AddcNode->getOpcode() != ARMISD::ADDC)
12845     return SDValue();
12846 
12847   // Find the converted UMAAL or quit if it doesn't exist.
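  // Illustrative pattern (operands arbitrary): with %u = UMLAL %a, %b, %c, 0,
  //   (ADDC %u:lo, %d)  glued to  (ADDE %u:hi, 0, carry)
  // computes the 64-bit value a*b + c + d, which maps onto UMAAL %a, %b, %c, %d.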
12848 SDNode *UmlalNode = nullptr; 12849 SDValue AddHi; 12850 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 12851 UmlalNode = AddcNode->getOperand(0).getNode(); 12852 AddHi = AddcNode->getOperand(1); 12853 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 12854 UmlalNode = AddcNode->getOperand(1).getNode(); 12855 AddHi = AddcNode->getOperand(0); 12856 } else { 12857 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 12858 } 12859 12860 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 12861 // the ADDC as well as Zero. 12862 if (!isNullConstant(UmlalNode->getOperand(3))) 12863 return SDValue(); 12864 12865 if ((isNullConstant(AddeNode->getOperand(0)) && 12866 AddeNode->getOperand(1).getNode() == UmlalNode) || 12867 (AddeNode->getOperand(0).getNode() == UmlalNode && 12868 isNullConstant(AddeNode->getOperand(1)))) { 12869 SelectionDAG &DAG = DCI.DAG; 12870 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 12871 UmlalNode->getOperand(2), AddHi }; 12872 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 12873 DAG.getVTList(MVT::i32, MVT::i32), Ops); 12874 12875 // Replace the ADDs' nodes uses by the UMAAL node's values. 12876 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 12877 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 12878 12879 // Return original node to notify the driver to stop replacing. 12880 return SDValue(AddeNode, 0); 12881 } 12882 return SDValue(); 12883 } 12884 12885 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, 12886 const ARMSubtarget *Subtarget) { 12887 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 12888 return SDValue(); 12889 12890 // Check that we have a pair of ADDC and ADDE as operands. 12891 // Both addends of the ADDE must be zero. 12892 SDNode* AddcNode = N->getOperand(2).getNode(); 12893 SDNode* AddeNode = N->getOperand(3).getNode(); 12894 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 12895 (AddeNode->getOpcode() == ARMISD::ADDE) && 12896 isNullConstant(AddeNode->getOperand(0)) && 12897 isNullConstant(AddeNode->getOperand(1)) && 12898 (AddeNode->getOperand(2).getNode() == AddcNode)) 12899 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 12900 DAG.getVTList(MVT::i32, MVT::i32), 12901 {N->getOperand(0), N->getOperand(1), 12902 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 12903 else 12904 return SDValue(); 12905 } 12906 12907 static SDValue PerformAddcSubcCombine(SDNode *N, 12908 TargetLowering::DAGCombinerInfo &DCI, 12909 const ARMSubtarget *Subtarget) { 12910 SelectionDAG &DAG(DCI.DAG); 12911 12912 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) { 12913 // (SUBC (ADDE 0, 0, C), 1) -> C 12914 SDValue LHS = N->getOperand(0); 12915 SDValue RHS = N->getOperand(1); 12916 if (LHS->getOpcode() == ARMISD::ADDE && 12917 isNullConstant(LHS->getOperand(0)) && 12918 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 12919 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 12920 } 12921 } 12922 12923 if (Subtarget->isThumb1Only()) { 12924 SDValue RHS = N->getOperand(1); 12925 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 12926 int32_t imm = C->getSExtValue(); 12927 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 12928 SDLoc DL(N); 12929 RHS = DAG.getConstant(-imm, DL, MVT::i32); 12930 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? 
ARMISD::SUBC 12931 : ARMISD::ADDC; 12932 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 12933 } 12934 } 12935 } 12936 12937 return SDValue(); 12938 } 12939 12940 static SDValue PerformAddeSubeCombine(SDNode *N, 12941 TargetLowering::DAGCombinerInfo &DCI, 12942 const ARMSubtarget *Subtarget) { 12943 if (Subtarget->isThumb1Only()) { 12944 SelectionDAG &DAG = DCI.DAG; 12945 SDValue RHS = N->getOperand(1); 12946 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 12947 int64_t imm = C->getSExtValue(); 12948 if (imm < 0) { 12949 SDLoc DL(N); 12950 12951 // The with-carry-in form matches bitwise not instead of the negation. 12952 // Effectively, the inverse interpretation of the carry flag already 12953 // accounts for part of the negation. 12954 RHS = DAG.getConstant(~imm, DL, MVT::i32); 12955 12956 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE 12957 : ARMISD::ADDE; 12958 return DAG.getNode(Opcode, DL, N->getVTList(), 12959 N->getOperand(0), RHS, N->getOperand(2)); 12960 } 12961 } 12962 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 12963 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 12964 } 12965 return SDValue(); 12966 } 12967 12968 static SDValue PerformSELECTCombine(SDNode *N, 12969 TargetLowering::DAGCombinerInfo &DCI, 12970 const ARMSubtarget *Subtarget) { 12971 if (!Subtarget->hasMVEIntegerOps()) 12972 return SDValue(); 12973 12974 SDLoc dl(N); 12975 SDValue SetCC; 12976 SDValue LHS; 12977 SDValue RHS; 12978 ISD::CondCode CC; 12979 SDValue TrueVal; 12980 SDValue FalseVal; 12981 12982 if (N->getOpcode() == ISD::SELECT && 12983 N->getOperand(0)->getOpcode() == ISD::SETCC) { 12984 SetCC = N->getOperand(0); 12985 LHS = SetCC->getOperand(0); 12986 RHS = SetCC->getOperand(1); 12987 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); 12988 TrueVal = N->getOperand(1); 12989 FalseVal = N->getOperand(2); 12990 } else if (N->getOpcode() == ISD::SELECT_CC) { 12991 LHS = N->getOperand(0); 12992 RHS = N->getOperand(1); 12993 CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); 12994 TrueVal = N->getOperand(2); 12995 FalseVal = N->getOperand(3); 12996 } else { 12997 return SDValue(); 12998 } 12999 13000 unsigned int Opcode = 0; 13001 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN || 13002 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) && 13003 (CC == ISD::SETULT || CC == ISD::SETUGT)) { 13004 Opcode = ARMISD::VMINVu; 13005 if (CC == ISD::SETUGT) 13006 std::swap(TrueVal, FalseVal); 13007 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN || 13008 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) && 13009 (CC == ISD::SETLT || CC == ISD::SETGT)) { 13010 Opcode = ARMISD::VMINVs; 13011 if (CC == ISD::SETGT) 13012 std::swap(TrueVal, FalseVal); 13013 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX || 13014 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) && 13015 (CC == ISD::SETUGT || CC == ISD::SETULT)) { 13016 Opcode = ARMISD::VMAXVu; 13017 if (CC == ISD::SETULT) 13018 std::swap(TrueVal, FalseVal); 13019 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX || 13020 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) && 13021 (CC == ISD::SETGT || CC == ISD::SETLT)) { 13022 Opcode = ARMISD::VMAXVs; 13023 if (CC == ISD::SETLT) 13024 std::swap(TrueVal, FalseVal); 13025 } else 13026 return SDValue(); 13027 13028 // Normalise to the right hand side being the vector reduction 13029 switch (TrueVal->getOpcode()) { 13030 case ISD::VECREDUCE_UMIN: 13031 case ISD::VECREDUCE_SMIN: 13032 case ISD::VECREDUCE_UMAX: 13033 case ISD::VECREDUCE_SMAX: 
13034 std::swap(LHS, RHS); 13035 std::swap(TrueVal, FalseVal); 13036 break; 13037 } 13038 13039 EVT VectorType = FalseVal->getOperand(0).getValueType(); 13040 13041 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 && 13042 VectorType != MVT::v4i32) 13043 return SDValue(); 13044 13045 EVT VectorScalarType = VectorType.getVectorElementType(); 13046 13047 // The values being selected must also be the ones being compared 13048 if (TrueVal != LHS || FalseVal != RHS) 13049 return SDValue(); 13050 13051 EVT LeftType = LHS->getValueType(0); 13052 EVT RightType = RHS->getValueType(0); 13053 13054 // The types must match the reduced type too 13055 if (LeftType != VectorScalarType || RightType != VectorScalarType) 13056 return SDValue(); 13057 13058 // Legalise the scalar to an i32 13059 if (VectorScalarType != MVT::i32) 13060 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 13061 13062 // Generate the reduction as an i32 for legalisation purposes 13063 auto Reduction = 13064 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0)); 13065 13066 // The result isn't actually an i32 so truncate it back to its original type 13067 if (VectorScalarType != MVT::i32) 13068 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction); 13069 13070 return Reduction; 13071 } 13072 13073 // A special combine for the vqdmulh family of instructions. This is one of the 13074 // potential set of patterns that could patch this instruction. The base pattern 13075 // you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))). 13076 // This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))), 13077 // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as 13078 // the max is unnecessary. 13079 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) { 13080 EVT VT = N->getValueType(0); 13081 SDValue Shft; 13082 ConstantSDNode *Clamp; 13083 13084 if (!VT.isVector() || VT.getScalarSizeInBits() > 64) 13085 return SDValue(); 13086 13087 if (N->getOpcode() == ISD::SMIN) { 13088 Shft = N->getOperand(0); 13089 Clamp = isConstOrConstSplat(N->getOperand(1)); 13090 } else if (N->getOpcode() == ISD::VSELECT) { 13091 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin. 
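    // For example (illustrative, with an i32 clamp on wider lanes):
    //   vselect (setcc %x, splat(0x7fffffff), setlt), %x, splat(0x7fffffff)
    // is an smin of %x and the splat, just written as a vselect because smin
    // is not legal for the wider element type.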
13092 SDValue Cmp = N->getOperand(0); 13093 if (Cmp.getOpcode() != ISD::SETCC || 13094 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT || 13095 Cmp.getOperand(0) != N->getOperand(1) || 13096 Cmp.getOperand(1) != N->getOperand(2)) 13097 return SDValue(); 13098 Shft = N->getOperand(1); 13099 Clamp = isConstOrConstSplat(N->getOperand(2)); 13100 } else 13101 return SDValue(); 13102 13103 if (!Clamp) 13104 return SDValue(); 13105 13106 MVT ScalarType; 13107 int ShftAmt = 0; 13108 switch (Clamp->getSExtValue()) { 13109 case (1 << 7) - 1: 13110 ScalarType = MVT::i8; 13111 ShftAmt = 7; 13112 break; 13113 case (1 << 15) - 1: 13114 ScalarType = MVT::i16; 13115 ShftAmt = 15; 13116 break; 13117 case (1ULL << 31) - 1: 13118 ScalarType = MVT::i32; 13119 ShftAmt = 31; 13120 break; 13121 default: 13122 return SDValue(); 13123 } 13124 13125 if (Shft.getOpcode() != ISD::SRA) 13126 return SDValue(); 13127 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1)); 13128 if (!N1 || N1->getSExtValue() != ShftAmt) 13129 return SDValue(); 13130 13131 SDValue Mul = Shft.getOperand(0); 13132 if (Mul.getOpcode() != ISD::MUL) 13133 return SDValue(); 13134 13135 SDValue Ext0 = Mul.getOperand(0); 13136 SDValue Ext1 = Mul.getOperand(1); 13137 if (Ext0.getOpcode() != ISD::SIGN_EXTEND || 13138 Ext1.getOpcode() != ISD::SIGN_EXTEND) 13139 return SDValue(); 13140 EVT VecVT = Ext0.getOperand(0).getValueType(); 13141 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1) 13142 return SDValue(); 13143 if (Ext1.getOperand(0).getValueType() != VecVT || 13144 VecVT.getScalarType() != ScalarType || 13145 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2) 13146 return SDValue(); 13147 13148 SDLoc DL(Mul); 13149 unsigned LegalLanes = 128 / (ShftAmt + 1); 13150 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes); 13151 // For types smaller than legal vectors extend to be legal and only use needed 13152 // lanes. 13153 if (VecVT.getSizeInBits() < 128) { 13154 EVT ExtVecVT = 13155 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()), 13156 VecVT.getVectorNumElements()); 13157 SDValue Inp0 = 13158 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0)); 13159 SDValue Inp1 = 13160 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0)); 13161 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0); 13162 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1); 13163 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1); 13164 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH); 13165 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc); 13166 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc); 13167 } 13168 13169 // For larger types, split into legal sized chunks. 
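  // For example, a v8i32 input is handled as two v4i32 halves: each half gets
  // its own VQDMULH, and the results are concatenated before the final
  // sign extend.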
13170 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type"); 13171 unsigned NumParts = VecVT.getSizeInBits() / 128; 13172 SmallVector<SDValue> Parts; 13173 for (unsigned I = 0; I < NumParts; ++I) { 13174 SDValue Inp0 = 13175 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0), 13176 DAG.getVectorIdxConstant(I * LegalLanes, DL)); 13177 SDValue Inp1 = 13178 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0), 13179 DAG.getVectorIdxConstant(I * LegalLanes, DL)); 13180 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1); 13181 Parts.push_back(VQDMULH); 13182 } 13183 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, 13184 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts)); 13185 } 13186 13187 static SDValue PerformVSELECTCombine(SDNode *N, 13188 TargetLowering::DAGCombinerInfo &DCI, 13189 const ARMSubtarget *Subtarget) { 13190 if (!Subtarget->hasMVEIntegerOps()) 13191 return SDValue(); 13192 13193 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG)) 13194 return V; 13195 13196 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs). 13197 // 13198 // We need to re-implement this optimization here as the implementation in the 13199 // Target-Independent DAGCombiner does not handle the kind of constant we make 13200 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for 13201 // good reason, allowing truncation there would break other targets). 13202 // 13203 // Currently, this is only done for MVE, as it's the only target that benefits 13204 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL). 13205 if (N->getOperand(0).getOpcode() != ISD::XOR) 13206 return SDValue(); 13207 SDValue XOR = N->getOperand(0); 13208 13209 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s. 13210 // It is important to check with truncation allowed as the BUILD_VECTORs we 13211 // generate in those situations will truncate their operands. 13212 ConstantSDNode *Const = 13213 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false, 13214 /*AllowTruncation*/ true); 13215 if (!Const || !Const->isOne()) 13216 return SDValue(); 13217 13218 // Rewrite into vselect(cond, rhs, lhs). 13219 SDValue Cond = XOR->getOperand(0); 13220 SDValue LHS = N->getOperand(1); 13221 SDValue RHS = N->getOperand(2); 13222 EVT Type = N->getValueType(0); 13223 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS); 13224 } 13225 13226 // Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n 13227 static SDValue PerformVSetCCToVCTPCombine(SDNode *N, 13228 TargetLowering::DAGCombinerInfo &DCI, 13229 const ARMSubtarget *Subtarget) { 13230 SDValue Op0 = N->getOperand(0); 13231 SDValue Op1 = N->getOperand(1); 13232 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 13233 EVT VT = N->getValueType(0); 13234 13235 if (!Subtarget->hasMVEIntegerOps() || 13236 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13237 return SDValue(); 13238 13239 if (CC == ISD::SETUGE) { 13240 std::swap(Op0, Op1); 13241 CC = ISD::SETULT; 13242 } 13243 13244 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 || 13245 Op0.getOpcode() != ISD::BUILD_VECTOR) 13246 return SDValue(); 13247 13248 // Check first operand is BuildVector of 0,1,2,... 
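  // (Undef lanes are tolerated; any defined lane must equal its own index.)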
13249 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) { 13250 if (!Op0.getOperand(I).isUndef() && 13251 !(isa<ConstantSDNode>(Op0.getOperand(I)) && 13252 Op0.getConstantOperandVal(I) == I)) 13253 return SDValue(); 13254 } 13255 13256 // The second is a Splat of Op1S 13257 SDValue Op1S = DCI.DAG.getSplatValue(Op1); 13258 if (!Op1S) 13259 return SDValue(); 13260 13261 unsigned Opc; 13262 switch (VT.getVectorNumElements()) { 13263 case 2: 13264 Opc = Intrinsic::arm_mve_vctp64; 13265 break; 13266 case 4: 13267 Opc = Intrinsic::arm_mve_vctp32; 13268 break; 13269 case 8: 13270 Opc = Intrinsic::arm_mve_vctp16; 13271 break; 13272 case 16: 13273 Opc = Intrinsic::arm_mve_vctp8; 13274 break; 13275 default: 13276 return SDValue(); 13277 } 13278 13279 SDLoc DL(N); 13280 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13281 DCI.DAG.getConstant(Opc, DL, MVT::i32), 13282 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32)); 13283 } 13284 13285 static SDValue PerformABSCombine(SDNode *N, 13286 TargetLowering::DAGCombinerInfo &DCI, 13287 const ARMSubtarget *Subtarget) { 13288 SelectionDAG &DAG = DCI.DAG; 13289 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13290 13291 if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) 13292 return SDValue(); 13293 13294 return TLI.expandABS(N, DAG); 13295 } 13296 13297 /// PerformADDECombine - Target-specific dag combine transform from 13298 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 13299 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 13300 static SDValue PerformADDECombine(SDNode *N, 13301 TargetLowering::DAGCombinerInfo &DCI, 13302 const ARMSubtarget *Subtarget) { 13303 // Only ARM and Thumb2 support UMLAL/SMLAL. 13304 if (Subtarget->isThumb1Only()) 13305 return PerformAddeSubeCombine(N, DCI, Subtarget); 13306 13307 // Only perform the checks after legalize when the pattern is available. 13308 if (DCI.isBeforeLegalize()) return SDValue(); 13309 13310 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 13311 } 13312 13313 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 13314 /// operands N0 and N1. This is a helper for PerformADDCombine that is 13315 /// called with the default operands, and if that fails, with commuted 13316 /// operands. 13317 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 13318 TargetLowering::DAGCombinerInfo &DCI, 13319 const ARMSubtarget *Subtarget){ 13320 // Attempt to create vpadd for this add. 13321 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 13322 return Result; 13323 13324 // Attempt to create vpaddl for this add. 
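  // (VPADDL pairwise-adds adjacent lanes into wider lanes; the helpers below
  // recognise adds that compute exactly such pairwise sums, e.g. an add of the
  // two halves of a VUZP.)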
13325 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 13326 return Result; 13327 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 13328 Subtarget)) 13329 return Result; 13330 13331 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 13332 if (N0.getNode()->hasOneUse()) 13333 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 13334 return Result; 13335 return SDValue(); 13336 } 13337 13338 static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) { 13339 EVT VT = N->getValueType(0); 13340 SDValue N0 = N->getOperand(0); 13341 SDValue N1 = N->getOperand(1); 13342 SDLoc dl(N); 13343 13344 auto IsVecReduce = [](SDValue Op) { 13345 switch (Op.getOpcode()) { 13346 case ISD::VECREDUCE_ADD: 13347 case ARMISD::VADDVs: 13348 case ARMISD::VADDVu: 13349 case ARMISD::VMLAVs: 13350 case ARMISD::VMLAVu: 13351 return true; 13352 } 13353 return false; 13354 }; 13355 13356 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) { 13357 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) -> 13358 // add(add(X, vecreduce(Y)), vecreduce(Z)) 13359 // to make better use of vaddva style instructions. 13360 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) && 13361 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) && 13362 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) { 13363 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0)); 13364 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1)); 13365 } 13366 // And turn add(add(A, reduce(B)), add(C, reduce(D))) -> 13367 // add(add(add(A, C), reduce(B)), reduce(D)) 13368 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD && 13369 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) { 13370 unsigned N0RedOp = 0; 13371 if (!IsVecReduce(N0.getOperand(N0RedOp))) { 13372 N0RedOp = 1; 13373 if (!IsVecReduce(N0.getOperand(N0RedOp))) 13374 return SDValue(); 13375 } 13376 13377 unsigned N1RedOp = 0; 13378 if (!IsVecReduce(N1.getOperand(N1RedOp))) 13379 N1RedOp = 1; 13380 if (!IsVecReduce(N1.getOperand(N1RedOp))) 13381 return SDValue(); 13382 13383 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp), 13384 N1.getOperand(1 - N1RedOp)); 13385 SDValue Add1 = 13386 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp)); 13387 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp)); 13388 } 13389 return SDValue(); 13390 }; 13391 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1)) 13392 return R; 13393 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0)) 13394 return R; 13395 13396 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z))) 13397 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z))) 13398 // by ascending load offsets. This can help cores prefetch if the order of 13399 // loads is more predictable. 13400 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) { 13401 // Check if two reductions are known to load data where one is before/after 13402 // another. Return negative if N0 loads data before N1, positive if N1 is 13403 // before N0 and 0 otherwise if nothing is known. 13404 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) { 13405 // Look through to the first operand of a MUL, for the VMLA case. 13406 // Currently only looks at the first operand, in the hope they are equal. 
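      // e.g. for a VMLAV-style reduce of mul(load, y), the ordering check is
      // made on that load only.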
13407 if (N0.getOpcode() == ISD::MUL) 13408 N0 = N0.getOperand(0); 13409 if (N1.getOpcode() == ISD::MUL) 13410 N1 = N1.getOperand(0); 13411 13412 // Return true if the two operands are loads to the same object and the 13413 // offset of the first is known to be less than the offset of the second. 13414 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0); 13415 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1); 13416 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() || 13417 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() || 13418 Load1->isIndexed()) 13419 return 0; 13420 13421 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG); 13422 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG); 13423 13424 if (!BaseLocDecomp0.getBase() || 13425 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() || 13426 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset()) 13427 return 0; 13428 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset()) 13429 return -1; 13430 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset()) 13431 return 1; 13432 return 0; 13433 }; 13434 13435 SDValue X; 13436 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) { 13437 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) { 13438 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0), 13439 N0.getOperand(1).getOperand(0)); 13440 if (IsBefore < 0) { 13441 X = N0.getOperand(0); 13442 N0 = N0.getOperand(1); 13443 } else if (IsBefore > 0) { 13444 X = N0.getOperand(1); 13445 N0 = N0.getOperand(0); 13446 } else 13447 return SDValue(); 13448 } else if (IsVecReduce(N0.getOperand(0))) { 13449 X = N0.getOperand(1); 13450 N0 = N0.getOperand(0); 13451 } else if (IsVecReduce(N0.getOperand(1))) { 13452 X = N0.getOperand(0); 13453 N0 = N0.getOperand(1); 13454 } else 13455 return SDValue(); 13456 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) && 13457 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) { 13458 // Note this is backward to how you would expect. We create 13459 // add(reduce(load + 16), reduce(load + 0)) so that the 13460 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving 13461 // the X as VADDV(load + 0) 13462 return DAG.getNode(ISD::ADD, dl, VT, N1, N0); 13463 } else 13464 return SDValue(); 13465 13466 if (!IsVecReduce(N0) || !IsVecReduce(N1)) 13467 return SDValue(); 13468 13469 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0) 13470 return SDValue(); 13471 13472 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0) 13473 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1); 13474 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0); 13475 }; 13476 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true)) 13477 return R; 13478 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false)) 13479 return R; 13480 return SDValue(); 13481 } 13482 13483 static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, 13484 const ARMSubtarget *Subtarget) { 13485 if (!Subtarget->hasMVEIntegerOps()) 13486 return SDValue(); 13487 13488 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG)) 13489 return R; 13490 13491 EVT VT = N->getValueType(0); 13492 SDValue N0 = N->getOperand(0); 13493 SDValue N1 = N->getOperand(1); 13494 SDLoc dl(N); 13495 13496 if (VT != MVT::i64) 13497 return SDValue(); 13498 13499 // We are looking for a i64 add of a VADDLVx. 
  // Due to these being i64s, this will look like:
  //   t1: i32,i32 = ARMISD::VADDLVs x
  //   t2: i64 = build_pair t1, t1:1
  //   t3: i64 = add t2, y
  // Otherwise we try to push the add up above VADDLVAx, to potentially allow
  // the add to be simplified separately.
  // We also need to check for sext / zext and commutative adds.
  auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
                           SDValue NB) {
    if (NB->getOpcode() != ISD::BUILD_PAIR)
      return SDValue();
    SDValue VecRed = NB->getOperand(0);
    if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
        VecRed.getResNo() != 0 ||
        NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
      return SDValue();

    if (VecRed->getOpcode() == OpcodeA) {
      // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
      SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
                                VecRed.getOperand(0), VecRed.getOperand(1));
      NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
    }

    SmallVector<SDValue, 4> Ops;
    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                              DAG.getConstant(0, dl, MVT::i32)));
    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                              DAG.getConstant(1, dl, MVT::i32)));
    unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
    for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
      Ops.push_back(VecRed->getOperand(I));
    SDValue Red =
        DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
                       SDValue(Red.getNode(), 1));
  };

  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
    return M;
  return SDValue();
}

bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
                                                 CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

13580 if (Level == BeforeLegalizeTypes) 13581 return true; 13582 13583 if (N->getOpcode() != ISD::SHL) 13584 return true; 13585 13586 if (Subtarget->isThumb1Only()) { 13587 // Avoid making expensive immediates by commuting shifts. (This logic 13588 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted 13589 // for free.) 13590 if (N->getOpcode() != ISD::SHL) 13591 return true; 13592 SDValue N1 = N->getOperand(0); 13593 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND && 13594 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR) 13595 return true; 13596 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) { 13597 if (Const->getAPIntValue().ult(256)) 13598 return false; 13599 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) && 13600 Const->getAPIntValue().sgt(-256)) 13601 return false; 13602 } 13603 return true; 13604 } 13605 13606 // Turn off commute-with-shift transform after legalization, so it doesn't 13607 // conflict with PerformSHLSimplify. (We could try to detect when 13608 // PerformSHLSimplify would trigger more precisely, but it isn't 13609 // really necessary.) 13610 return false; 13611 } 13612 13613 bool ARMTargetLowering::isDesirableToCommuteXorWithShift( 13614 const SDNode *N) const { 13615 assert(N->getOpcode() == ISD::XOR && 13616 (N->getOperand(0).getOpcode() == ISD::SHL || 13617 N->getOperand(0).getOpcode() == ISD::SRL) && 13618 "Expected XOR(SHIFT) pattern"); 13619 13620 // Only commute if the entire NOT mask is a hidden shifted mask. 13621 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13622 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)); 13623 if (XorC && ShiftC) { 13624 unsigned MaskIdx, MaskLen; 13625 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) { 13626 unsigned ShiftAmt = ShiftC->getZExtValue(); 13627 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); 13628 if (N->getOperand(0).getOpcode() == ISD::SHL) 13629 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt); 13630 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt); 13631 } 13632 } 13633 13634 return false; 13635 } 13636 13637 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( 13638 const SDNode *N, CombineLevel Level) const { 13639 assert(((N->getOpcode() == ISD::SHL && 13640 N->getOperand(0).getOpcode() == ISD::SRL) || 13641 (N->getOpcode() == ISD::SRL && 13642 N->getOperand(0).getOpcode() == ISD::SHL)) && 13643 "Expected shift-shift mask"); 13644 13645 if (!Subtarget->isThumb1Only()) 13646 return true; 13647 13648 if (Level == BeforeLegalizeTypes) 13649 return true; 13650 13651 return false; 13652 } 13653 13654 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { 13655 if (!Subtarget->hasNEON()) { 13656 if (Subtarget->isThumb1Only()) 13657 return VT.getScalarSizeInBits() <= 32; 13658 return true; 13659 } 13660 return VT.isScalarInteger(); 13661 } 13662 13663 bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, 13664 EVT VT) const { 13665 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple()) 13666 return false; 13667 13668 switch (FPVT.getSimpleVT().SimpleTy) { 13669 case MVT::f16: 13670 return Subtarget->hasVFP2Base(); 13671 case MVT::f32: 13672 return Subtarget->hasVFP2Base(); 13673 case MVT::f64: 13674 return Subtarget->hasFP64(); 13675 case MVT::v4f32: 13676 case MVT::v8f16: 13677 return Subtarget->hasMVEFloatOps(); 13678 default: 13679 return false; 13680 } 13681 } 13682 13683 static SDValue PerformSHLSimplify(SDNode *N, 13684 
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2)  -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform a shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto U : N->uses()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();

  // Check that performing a lshr will not lose any information.
  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
                                     C2Int.getBitWidth() - C2->getZExtValue());
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
13777 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 13778 13779 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 13780 SHL.dump(); N->dump()); 13781 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 13782 return Res; 13783 } 13784 13785 13786 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 13787 /// 13788 static SDValue PerformADDCombine(SDNode *N, 13789 TargetLowering::DAGCombinerInfo &DCI, 13790 const ARMSubtarget *Subtarget) { 13791 SDValue N0 = N->getOperand(0); 13792 SDValue N1 = N->getOperand(1); 13793 13794 // Only works one way, because it needs an immediate operand. 13795 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 13796 return Result; 13797 13798 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget)) 13799 return Result; 13800 13801 // First try with the default operand order. 13802 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 13803 return Result; 13804 13805 // If that didn't work, try again with the operands commuted. 13806 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 13807 } 13808 13809 // Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC) 13810 // providing -X is as cheap as X (currently, just a constant). 13811 static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) { 13812 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0))) 13813 return SDValue(); 13814 SDValue CSINC = N->getOperand(1); 13815 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse()) 13816 return SDValue(); 13817 13818 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0)); 13819 if (!X) 13820 return SDValue(); 13821 13822 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32, 13823 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0), 13824 CSINC.getOperand(0)), 13825 CSINC.getOperand(1), CSINC.getOperand(2), 13826 CSINC.getOperand(3)); 13827 } 13828 13829 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 13830 /// 13831 static SDValue PerformSUBCombine(SDNode *N, 13832 TargetLowering::DAGCombinerInfo &DCI, 13833 const ARMSubtarget *Subtarget) { 13834 SDValue N0 = N->getOperand(0); 13835 SDValue N1 = N->getOperand(1); 13836 13837 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 13838 if (N1.getNode()->hasOneUse()) 13839 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 13840 return Result; 13841 13842 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG)) 13843 return R; 13844 13845 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) 13846 return SDValue(); 13847 13848 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x)) 13849 // so that we can readily pattern match more mve instructions which can use 13850 // a scalar operand. 
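  // For example (with v4i32):
  //   t1: v4i32 = ARMISD::VMOVIMM 0
  //   t2: v4i32 = ARMISD::VDUP r0
  //   (sub t1, t2)  -->  ARMISD::VDUP (sub 0, r0)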
  SDValue VDup = N->getOperand(1);
  if (VDup->getOpcode() != ARMISD::VDUP)
    return SDValue();

  SDValue VMov = N->getOperand(0);
  if (VMov->getOpcode() == ISD::BITCAST)
    VMov = VMov->getOperand(0);

  if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
    return SDValue();

  SDLoc dl(N);
  SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DCI.DAG.getConstant(0, dl, MVT::i32),
                                   VDup->getOperand(0));
  return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
}

/// PerformVMULCombine
/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
/// special multiplier accumulator forwarding.
///   vmul d3, d0, d2
///   vmla d3, d1, d2
/// is faster than
///   vadd d3, d0, d1
///   vmul d3, d3, d2
// However, for (A + B) * (A + B),
//   vadd d2, d0, d1
//   vmul d3, d0, d2
//   vmla d3, d1, d2
// is slower than
//   vadd d2, d0, d1
//   vmul d3, d2, d2
static SDValue PerformVMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasVMLxForwarding())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N0.getOpcode();
  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
    Opcode = N1.getOpcode();
    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
        Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return SDValue();
    std::swap(N0, N1);
  }

  if (N0 == N1)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);
  return DAG.getNode(Opcode, DL, VT,
                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}

static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
                                      const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto IsSignExt = [&](SDValue Op) {
    if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
      return SDValue();
    EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
    if (VT.getScalarSizeInBits() == 32)
      return Op->getOperand(0);
    return SDValue();
  };
  auto IsZeroExt = [&](SDValue Op) {
    // Zero extends are a little more awkward. At the point we are matching
    // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
    // That might be before or after a bitcast depending on how the and is
    // placed. Because this has to look through bitcasts, it is currently only
    // supported on LE.
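    // For example, zero extending the low i32 of each i64 lane of a v2i64
    // appears here as: and x, build_vector(-1, 0, -1, 0) when viewed as v4i32.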
13938 if (!Subtarget->isLittle()) 13939 return SDValue(); 13940 13941 SDValue And = Op; 13942 if (And->getOpcode() == ISD::BITCAST) 13943 And = And->getOperand(0); 13944 if (And->getOpcode() != ISD::AND) 13945 return SDValue(); 13946 SDValue Mask = And->getOperand(1); 13947 if (Mask->getOpcode() == ISD::BITCAST) 13948 Mask = Mask->getOperand(0); 13949 13950 if (Mask->getOpcode() != ISD::BUILD_VECTOR || 13951 Mask.getValueType() != MVT::v4i32) 13952 return SDValue(); 13953 if (isAllOnesConstant(Mask->getOperand(0)) && 13954 isNullConstant(Mask->getOperand(1)) && 13955 isAllOnesConstant(Mask->getOperand(2)) && 13956 isNullConstant(Mask->getOperand(3))) 13957 return And->getOperand(0); 13958 return SDValue(); 13959 }; 13960 13961 SDLoc dl(N); 13962 if (SDValue Op0 = IsSignExt(N0)) { 13963 if (SDValue Op1 = IsSignExt(N1)) { 13964 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); 13965 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); 13966 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a); 13967 } 13968 } 13969 if (SDValue Op0 = IsZeroExt(N0)) { 13970 if (SDValue Op1 = IsZeroExt(N1)) { 13971 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); 13972 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); 13973 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a); 13974 } 13975 } 13976 13977 return SDValue(); 13978 } 13979 13980 static SDValue PerformMULCombine(SDNode *N, 13981 TargetLowering::DAGCombinerInfo &DCI, 13982 const ARMSubtarget *Subtarget) { 13983 SelectionDAG &DAG = DCI.DAG; 13984 13985 EVT VT = N->getValueType(0); 13986 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64) 13987 return PerformMVEVMULLCombine(N, DAG, Subtarget); 13988 13989 if (Subtarget->isThumb1Only()) 13990 return SDValue(); 13991 13992 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13993 return SDValue(); 13994 13995 if (VT.is64BitVector() || VT.is128BitVector()) 13996 return PerformVMULCombine(N, DCI, Subtarget); 13997 if (VT != MVT::i32) 13998 return SDValue(); 13999 14000 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 14001 if (!C) 14002 return SDValue(); 14003 14004 int64_t MulAmt = C->getSExtValue(); 14005 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 14006 14007 ShiftAmt = ShiftAmt & (32 - 1); 14008 SDValue V = N->getOperand(0); 14009 SDLoc DL(N); 14010 14011 SDValue Res; 14012 MulAmt >>= ShiftAmt; 14013 14014 if (MulAmt >= 0) { 14015 if (isPowerOf2_32(MulAmt - 1)) { 14016 // (mul x, 2^N + 1) => (add (shl x, N), x) 14017 Res = DAG.getNode(ISD::ADD, DL, VT, 14018 V, 14019 DAG.getNode(ISD::SHL, DL, VT, 14020 V, 14021 DAG.getConstant(Log2_32(MulAmt - 1), DL, 14022 MVT::i32))); 14023 } else if (isPowerOf2_32(MulAmt + 1)) { 14024 // (mul x, 2^N - 1) => (sub (shl x, N), x) 14025 Res = DAG.getNode(ISD::SUB, DL, VT, 14026 DAG.getNode(ISD::SHL, DL, VT, 14027 V, 14028 DAG.getConstant(Log2_32(MulAmt + 1), DL, 14029 MVT::i32)), 14030 V); 14031 } else 14032 return SDValue(); 14033 } else { 14034 uint64_t MulAmtAbs = -MulAmt; 14035 if (isPowerOf2_32(MulAmtAbs + 1)) { 14036 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 14037 Res = DAG.getNode(ISD::SUB, DL, VT, 14038 V, 14039 DAG.getNode(ISD::SHL, DL, VT, 14040 V, 14041 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 14042 MVT::i32))); 14043 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 14044 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 14045 Res = DAG.getNode(ISD::ADD, DL, VT, 14046 V, 14047 DAG.getNode(ISD::SHL, DL, VT, 14048 V, 
14049 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 14050 MVT::i32))); 14051 Res = DAG.getNode(ISD::SUB, DL, VT, 14052 DAG.getConstant(0, DL, MVT::i32), Res); 14053 } else 14054 return SDValue(); 14055 } 14056 14057 if (ShiftAmt != 0) 14058 Res = DAG.getNode(ISD::SHL, DL, VT, 14059 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 14060 14061 // Do not add new nodes to DAG combiner worklist. 14062 DCI.CombineTo(N, Res, false); 14063 return SDValue(); 14064 } 14065 14066 static SDValue CombineANDShift(SDNode *N, 14067 TargetLowering::DAGCombinerInfo &DCI, 14068 const ARMSubtarget *Subtarget) { 14069 // Allow DAGCombine to pattern-match before we touch the canonical form. 14070 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 14071 return SDValue(); 14072 14073 if (N->getValueType(0) != MVT::i32) 14074 return SDValue(); 14075 14076 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 14077 if (!N1C) 14078 return SDValue(); 14079 14080 uint32_t C1 = (uint32_t)N1C->getZExtValue(); 14081 // Don't transform uxtb/uxth. 14082 if (C1 == 255 || C1 == 65535) 14083 return SDValue(); 14084 14085 SDNode *N0 = N->getOperand(0).getNode(); 14086 if (!N0->hasOneUse()) 14087 return SDValue(); 14088 14089 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) 14090 return SDValue(); 14091 14092 bool LeftShift = N0->getOpcode() == ISD::SHL; 14093 14094 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 14095 if (!N01C) 14096 return SDValue(); 14097 14098 uint32_t C2 = (uint32_t)N01C->getZExtValue(); 14099 if (!C2 || C2 >= 32) 14100 return SDValue(); 14101 14102 // Clear irrelevant bits in the mask. 14103 if (LeftShift) 14104 C1 &= (-1U << C2); 14105 else 14106 C1 &= (-1U >> C2); 14107 14108 SelectionDAG &DAG = DCI.DAG; 14109 SDLoc DL(N); 14110 14111 // We have a pattern of the form "(and (shl x, c2) c1)" or 14112 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to 14113 // transform to a pair of shifts, to save materializing c1. 14114 14115 // First pattern: right shift, then mask off leading bits. 14116 // FIXME: Use demanded bits? 14117 if (!LeftShift && isMask_32(C1)) { 14118 uint32_t C3 = countLeadingZeros(C1); 14119 if (C2 < C3) { 14120 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 14121 DAG.getConstant(C3 - C2, DL, MVT::i32)); 14122 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 14123 DAG.getConstant(C3, DL, MVT::i32)); 14124 } 14125 } 14126 14127 // First pattern, reversed: left shift, then mask off trailing bits. 14128 if (LeftShift && isMask_32(~C1)) { 14129 uint32_t C3 = countTrailingZeros(C1); 14130 if (C2 < C3) { 14131 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 14132 DAG.getConstant(C3 - C2, DL, MVT::i32)); 14133 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 14134 DAG.getConstant(C3, DL, MVT::i32)); 14135 } 14136 } 14137 14138 // Second pattern: left shift, then mask off leading bits. 14139 // FIXME: Use demanded bits? 14140 if (LeftShift && isShiftedMask_32(C1)) { 14141 uint32_t Trailing = countTrailingZeros(C1); 14142 uint32_t C3 = countLeadingZeros(C1); 14143 if (Trailing == C2 && C2 + C3 < 32) { 14144 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 14145 DAG.getConstant(C2 + C3, DL, MVT::i32)); 14146 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 14147 DAG.getConstant(C3, DL, MVT::i32)); 14148 } 14149 } 14150 14151 // Second pattern, reversed: right shift, then mask off trailing bits. 14152 // FIXME: Handle other patterns of known/demanded bits. 
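  // For example, (and (srl x, 24), 0xf0) becomes (shl (srl x, 28), 4) here,
  // saving the materialisation of the 0xf0 mask.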
14153 if (!LeftShift && isShiftedMask_32(C1)) { 14154 uint32_t Leading = countLeadingZeros(C1); 14155 uint32_t C3 = countTrailingZeros(C1); 14156 if (Leading == C2 && C2 + C3 < 32) { 14157 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 14158 DAG.getConstant(C2 + C3, DL, MVT::i32)); 14159 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 14160 DAG.getConstant(C3, DL, MVT::i32)); 14161 } 14162 } 14163 14164 // FIXME: Transform "(and (shl x, c2) c1)" -> 14165 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than 14166 // c1. 14167 return SDValue(); 14168 } 14169 14170 static SDValue PerformANDCombine(SDNode *N, 14171 TargetLowering::DAGCombinerInfo &DCI, 14172 const ARMSubtarget *Subtarget) { 14173 // Attempt to use immediate-form VBIC 14174 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 14175 SDLoc dl(N); 14176 EVT VT = N->getValueType(0); 14177 SelectionDAG &DAG = DCI.DAG; 14178 14179 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 || 14180 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1) 14181 return SDValue(); 14182 14183 APInt SplatBits, SplatUndef; 14184 unsigned SplatBitSize; 14185 bool HasAnyUndefs; 14186 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && 14187 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 14188 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || 14189 SplatBitSize == 64) { 14190 EVT VbicVT; 14191 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), 14192 SplatUndef.getZExtValue(), SplatBitSize, 14193 DAG, dl, VbicVT, VT, OtherModImm); 14194 if (Val.getNode()) { 14195 SDValue Input = 14196 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 14197 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 14198 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 14199 } 14200 } 14201 } 14202 14203 if (!Subtarget->isThumb1Only()) { 14204 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 14205 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 14206 return Result; 14207 14208 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 14209 return Result; 14210 } 14211 14212 if (Subtarget->isThumb1Only()) 14213 if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) 14214 return Result; 14215 14216 return SDValue(); 14217 } 14218 14219 // Try combining OR nodes to SMULWB, SMULWT. 14220 static SDValue PerformORCombineToSMULWBT(SDNode *OR, 14221 TargetLowering::DAGCombinerInfo &DCI, 14222 const ARMSubtarget *Subtarget) { 14223 if (!Subtarget->hasV6Ops() || 14224 (Subtarget->isThumb() && 14225 (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) 14226 return SDValue(); 14227 14228 SDValue SRL = OR->getOperand(0); 14229 SDValue SHL = OR->getOperand(1); 14230 14231 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { 14232 SRL = OR->getOperand(1); 14233 SHL = OR->getOperand(0); 14234 } 14235 if (!isSRL16(SRL) || !isSHL16(SHL)) 14236 return SDValue(); 14237 14238 // The first operands to the shifts need to be the two results from the 14239 // same smul_lohi node. 
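  // That is, the srl must use the low half (result 0) and the shl the high
  // half (result 1) of that smul_lohi.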
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
      SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}

static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as-is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
14400 return SDValue(N, 0); 14401 } 14402 14403 return SDValue(); 14404 } 14405 14406 static bool isValidMVECond(unsigned CC, bool IsFloat) { 14407 switch (CC) { 14408 case ARMCC::EQ: 14409 case ARMCC::NE: 14410 case ARMCC::LE: 14411 case ARMCC::GT: 14412 case ARMCC::GE: 14413 case ARMCC::LT: 14414 return true; 14415 case ARMCC::HS: 14416 case ARMCC::HI: 14417 return !IsFloat; 14418 default: 14419 return false; 14420 }; 14421 } 14422 14423 static ARMCC::CondCodes getVCMPCondCode(SDValue N) { 14424 if (N->getOpcode() == ARMISD::VCMP) 14425 return (ARMCC::CondCodes)N->getConstantOperandVal(2); 14426 else if (N->getOpcode() == ARMISD::VCMPZ) 14427 return (ARMCC::CondCodes)N->getConstantOperandVal(1); 14428 else 14429 llvm_unreachable("Not a VCMP/VCMPZ!"); 14430 } 14431 14432 static bool CanInvertMVEVCMP(SDValue N) { 14433 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N)); 14434 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint()); 14435 } 14436 14437 static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, 14438 const ARMSubtarget *Subtarget) { 14439 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain 14440 // together with predicates 14441 EVT VT = N->getValueType(0); 14442 SDLoc DL(N); 14443 SDValue N0 = N->getOperand(0); 14444 SDValue N1 = N->getOperand(1); 14445 14446 auto IsFreelyInvertable = [&](SDValue V) { 14447 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ) 14448 return CanInvertMVEVCMP(V); 14449 return false; 14450 }; 14451 14452 // At least one operand must be freely invertable. 14453 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1))) 14454 return SDValue(); 14455 14456 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT); 14457 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT); 14458 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1); 14459 return DAG.getLogicalNOT(DL, And, VT); 14460 } 14461 14462 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 14463 static SDValue PerformORCombine(SDNode *N, 14464 TargetLowering::DAGCombinerInfo &DCI, 14465 const ARMSubtarget *Subtarget) { 14466 // Attempt to use immediate-form VORR 14467 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 14468 SDLoc dl(N); 14469 EVT VT = N->getValueType(0); 14470 SelectionDAG &DAG = DCI.DAG; 14471 14472 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 14473 return SDValue(); 14474 14475 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 || 14476 VT == MVT::v8i1 || VT == MVT::v16i1)) 14477 return PerformORCombine_i1(N, DAG, Subtarget); 14478 14479 APInt SplatBits, SplatUndef; 14480 unsigned SplatBitSize; 14481 bool HasAnyUndefs; 14482 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && 14483 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 14484 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || 14485 SplatBitSize == 64) { 14486 EVT VorrVT; 14487 SDValue Val = 14488 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), 14489 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm); 14490 if (Val.getNode()) { 14491 SDValue Input = 14492 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 14493 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 14494 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 14495 } 14496 } 14497 } 14498 14499 if (!Subtarget->isThumb1Only()) { 14500 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 14501 if (SDValue Result 
= combineSelectAndUseCommutative(N, false, DCI)) 14502 return Result; 14503 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) 14504 return Result; 14505 } 14506 14507 SDValue N0 = N->getOperand(0); 14508 SDValue N1 = N->getOperand(1); 14509 14510 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 14511 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 14512 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 14513 14514 // The code below optimizes (or (and X, Y), Z). 14515 // The AND operand needs to have a single user to make these optimizations 14516 // profitable. 14517 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 14518 return SDValue(); 14519 14520 APInt SplatUndef; 14521 unsigned SplatBitSize; 14522 bool HasAnyUndefs; 14523 14524 APInt SplatBits0, SplatBits1; 14525 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 14526 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 14527 // Ensure that the second operand of both ands are constants 14528 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 14529 HasAnyUndefs) && !HasAnyUndefs) { 14530 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 14531 HasAnyUndefs) && !HasAnyUndefs) { 14532 // Ensure that the bit width of the constants are the same and that 14533 // the splat arguments are logical inverses as per the pattern we 14534 // are trying to simplify. 14535 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 14536 SplatBits0 == ~SplatBits1) { 14537 // Canonicalize the vector type to make instruction selection 14538 // simpler. 14539 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 14540 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT, 14541 N0->getOperand(1), 14542 N0->getOperand(0), 14543 N1->getOperand(0)); 14544 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 14545 } 14546 } 14547 } 14548 } 14549 14550 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 14551 // reasonable. 14552 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 14553 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget)) 14554 return Res; 14555 } 14556 14557 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 14558 return Result; 14559 14560 return SDValue(); 14561 } 14562 14563 static SDValue PerformXORCombine(SDNode *N, 14564 TargetLowering::DAGCombinerInfo &DCI, 14565 const ARMSubtarget *Subtarget) { 14566 EVT VT = N->getValueType(0); 14567 SelectionDAG &DAG = DCI.DAG; 14568 14569 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 14570 return SDValue(); 14571 14572 if (!Subtarget->isThumb1Only()) { 14573 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 14574 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 14575 return Result; 14576 14577 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 14578 return Result; 14579 } 14580 14581 if (Subtarget->hasMVEIntegerOps()) { 14582 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition. 
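    // For example: xor (VCMP a, b, eq), true  -->  VCMP a, b, ne, and likewise
    // for VCMPZ, provided the opposite condition is also representable in MVE
    // (checked by CanInvertMVEVCMP).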
14583 SDValue N0 = N->getOperand(0); 14584 SDValue N1 = N->getOperand(1); 14585 const TargetLowering *TLI = Subtarget->getTargetLowering(); 14586 if (TLI->isConstTrueVal(N1) && 14587 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) { 14588 if (CanInvertMVEVCMP(N0)) { 14589 SDLoc DL(N0); 14590 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0)); 14591 14592 SmallVector<SDValue, 4> Ops; 14593 Ops.push_back(N0->getOperand(0)); 14594 if (N0->getOpcode() == ARMISD::VCMP) 14595 Ops.push_back(N0->getOperand(1)); 14596 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32)); 14597 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops); 14598 } 14599 } 14600 } 14601 14602 return SDValue(); 14603 } 14604 14605 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 14606 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 14607 // their position in "to" (Rd). 14608 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 14609 assert(N->getOpcode() == ARMISD::BFI); 14610 14611 SDValue From = N->getOperand(1); 14612 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 14613 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 14614 14615 // If the Base came from a SHR #C, we can deduce that it is really testing bit 14616 // #C in the base of the SHR. 14617 if (From->getOpcode() == ISD::SRL && 14618 isa<ConstantSDNode>(From->getOperand(1))) { 14619 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 14620 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 14621 FromMask <<= Shift.getLimitedValue(31); 14622 From = From->getOperand(0); 14623 } 14624 14625 return From; 14626 } 14627 14628 // If A and B contain one contiguous set of bits, does A | B == A . B? 14629 // 14630 // Neither A nor B must be zero. 14631 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 14632 unsigned LastActiveBitInA = A.countTrailingZeros(); 14633 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 14634 return LastActiveBitInA - 1 == FirstActiveBitInB; 14635 } 14636 14637 static SDValue FindBFIToCombineWith(SDNode *N) { 14638 // We have a BFI in N. Find a BFI it can combine with, if one exists. 14639 APInt ToMask, FromMask; 14640 SDValue From = ParseBFI(N, ToMask, FromMask); 14641 SDValue To = N->getOperand(0); 14642 14643 SDValue V = To; 14644 if (V.getOpcode() != ARMISD::BFI) 14645 return SDValue(); 14646 14647 APInt NewToMask, NewFromMask; 14648 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 14649 if (NewFrom != From) 14650 return SDValue(); 14651 14652 // Do the written bits conflict with any we've seen so far? 14653 if ((NewToMask & ToMask).getBoolValue()) 14654 // Conflicting bits. 14655 return SDValue(); 14656 14657 // Are the new bits contiguous when combined with the old bits? 
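  // Either order is acceptable: the BFI already seen may write the low part
  // and the new one the high part, or vice versa.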
14658 if (BitsProperlyConcatenate(ToMask, NewToMask) && 14659 BitsProperlyConcatenate(FromMask, NewFromMask)) 14660 return V; 14661 if (BitsProperlyConcatenate(NewToMask, ToMask) && 14662 BitsProperlyConcatenate(NewFromMask, FromMask)) 14663 return V; 14664 14665 return SDValue(); 14666 } 14667 14668 static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { 14669 SDValue N0 = N->getOperand(0); 14670 SDValue N1 = N->getOperand(1); 14671 14672 if (N1.getOpcode() == ISD::AND) { 14673 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 14674 // the bits being cleared by the AND are not demanded by the BFI. 14675 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 14676 if (!N11C) 14677 return SDValue(); 14678 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 14679 unsigned LSB = countTrailingZeros(~InvMask); 14680 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 14681 assert(Width < 14682 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 14683 "undefined behavior"); 14684 unsigned Mask = (1u << Width) - 1; 14685 unsigned Mask2 = N11C->getZExtValue(); 14686 if ((Mask & (~Mask2)) == 0) 14687 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 14688 N->getOperand(0), N1.getOperand(0), N->getOperand(2)); 14689 return SDValue(); 14690 } 14691 14692 // Look for another BFI to combine with. 14693 if (SDValue CombineBFI = FindBFIToCombineWith(N)) { 14694 // We've found a BFI. 14695 APInt ToMask1, FromMask1; 14696 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 14697 14698 APInt ToMask2, FromMask2; 14699 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 14700 assert(From1 == From2); 14701 (void)From2; 14702 14703 // Create a new BFI, combining the two together. 14704 APInt NewFromMask = FromMask1 | FromMask2; 14705 APInt NewToMask = ToMask1 | ToMask2; 14706 14707 EVT VT = N->getValueType(0); 14708 SDLoc dl(N); 14709 14710 if (NewFromMask[0] == 0) 14711 From1 = DAG.getNode( 14712 ISD::SRL, dl, VT, From1, 14713 DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 14714 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1, 14715 DAG.getConstant(~NewToMask, dl, VT)); 14716 } 14717 14718 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so 14719 // that lower bit insertions are performed first, providing that M1 and M2 14720 // do no overlap. This can allow multiple BFI instructions to be combined 14721 // together by the other folds above. 14722 if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 14723 APInt ToMask1 = ~N->getConstantOperandAPInt(2); 14724 APInt ToMask2 = ~N0.getConstantOperandAPInt(2); 14725 14726 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 || 14727 ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros()) 14728 return SDValue(); 14729 14730 EVT VT = N->getValueType(0); 14731 SDLoc dl(N); 14732 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0), 14733 N->getOperand(1), N->getOperand(2)); 14734 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1), 14735 N0.getOperand(2)); 14736 } 14737 14738 return SDValue(); 14739 } 14740 14741 // Check that N is CMPZ(CSINC(0, 0, CC, X)), 14742 // or CMPZ(CMOV(1, 0, CC, $cpsr, X)) 14743 // return X if valid. 
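// For illustration, a matching pattern looks like:
//   t1: i32  = ARMISD::CSINC 0, 0, CC, X
//   t2: glue = ARMISD::CMPZ t1, 0
// in which case CC is reported through the out-parameter and X is returned.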
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
  if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
    return SDValue();
  SDValue CSInc = Cmp->getOperand(0);

  // Ignore any `And 1` nodes that may not yet have been removed. We are
  // looking for a value that produces 1/0, so these have no effect on the
  // code.
  while (CSInc.getOpcode() == ISD::AND &&
         isa<ConstantSDNode>(CSInc.getOperand(1)) &&
         CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
    CSInc = CSInc.getOperand(0);

  if (CSInc.getOpcode() == ARMISD::CSINC &&
      isNullConstant(CSInc.getOperand(0)) &&
      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
    CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
    return CSInc.getOperand(3);
  }
  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
    CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
    return CSInc.getOperand(4);
  }
  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
      isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
    CC = ARMCC::getOppositeCondition(
        (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
    return CSInc.getOperand(4);
  }
  return SDValue();
}

static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
  // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
  //   t92: glue = ARMISD::CMPZ t74, 0
  //   t93: i32 = ARMISD::CSINC 0, 0, 1, t92
  //   t96: glue = ARMISD::CMPZ t93, 0
  //   t114: i32 = ARMISD::CSINV 0, 0, 0, t96
  ARMCC::CondCodes Cond;
  if (SDValue C = IsCMPZCSINC(N, Cond))
    if (Cond == ARMCC::EQ)
      return C;
  return SDValue();
}

static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
  // Fold away an unnecessary CMPZ/CSINC
  // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
  // if C1==EQ -> CSXYZ A, B, C2, D
  // if C1==NE -> CSXYZ A, B, NOT(C2), D
  ARMCC::CondCodes Cond;
  if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
    if (N->getConstantOperandVal(2) == ARMCC::EQ)
      return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
                         N->getOperand(1),
                         DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
    if (N->getConstantOperandVal(2) == ARMCC::NE)
      return DAG.getNode(
          N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
          N->getOperand(1),
          DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
  }
  return SDValue();
}

/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
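/// For illustration, the main folds performed below include:
///   vmovrrd(vmovdrr x, y)  -> x, y
///   vmovrrd(load f64)      -> (load i32), (load i32 at offset +4)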
14812 static SDValue PerformVMOVRRDCombine(SDNode *N, 14813 TargetLowering::DAGCombinerInfo &DCI, 14814 const ARMSubtarget *Subtarget) { 14815 // vmovrrd(vmovdrr x, y) -> x,y 14816 SDValue InDouble = N->getOperand(0); 14817 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64()) 14818 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 14819 14820 // vmovrrd(load f64) -> (load i32), (load i32) 14821 SDNode *InNode = InDouble.getNode(); 14822 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 14823 InNode->getValueType(0) == MVT::f64 && 14824 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 14825 !cast<LoadSDNode>(InNode)->isVolatile()) { 14826 // TODO: Should this be done for non-FrameIndex operands? 14827 LoadSDNode *LD = cast<LoadSDNode>(InNode); 14828 14829 SelectionDAG &DAG = DCI.DAG; 14830 SDLoc DL(LD); 14831 SDValue BasePtr = LD->getBasePtr(); 14832 SDValue NewLD1 = 14833 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 14834 LD->getAlign(), LD->getMemOperand()->getFlags()); 14835 14836 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 14837 DAG.getConstant(4, DL, MVT::i32)); 14838 14839 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, 14840 LD->getPointerInfo().getWithOffset(4), 14841 commonAlignment(LD->getAlign(), 4), 14842 LD->getMemOperand()->getFlags()); 14843 14844 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 14845 if (DCI.DAG.getDataLayout().isBigEndian()) 14846 std::swap (NewLD1, NewLD2); 14847 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 14848 return Result; 14849 } 14850 14851 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d 14852 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b 14853 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 14854 isa<ConstantSDNode>(InDouble.getOperand(1))) { 14855 SDValue BV = InDouble.getOperand(0); 14856 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may 14857 // change lane order under big endian. 14858 bool BVSwap = BV.getOpcode() == ISD::BITCAST; 14859 while ( 14860 (BV.getOpcode() == ISD::BITCAST || 14861 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) && 14862 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) { 14863 BVSwap = BV.getOpcode() == ISD::BITCAST; 14864 BV = BV.getOperand(0); 14865 } 14866 if (BV.getValueType() != MVT::v4i32) 14867 return SDValue(); 14868 14869 // Handle buildvectors, pulling out the correct lane depending on 14870 // endianness. 14871 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0; 14872 if (BV.getOpcode() == ISD::BUILD_VECTOR) { 14873 SDValue Op0 = BV.getOperand(Offset); 14874 SDValue Op1 = BV.getOperand(Offset + 1); 14875 if (!Subtarget->isLittle() && BVSwap) 14876 std::swap(Op0, Op1); 14877 14878 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); 14879 } 14880 14881 // A chain of insert_vectors, grabbing the correct value of the chain of 14882 // inserts. 
14883 SDValue Op0, Op1; 14884 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) { 14885 if (isa<ConstantSDNode>(BV.getOperand(2))) { 14886 if (BV.getConstantOperandVal(2) == Offset) 14887 Op0 = BV.getOperand(1); 14888 if (BV.getConstantOperandVal(2) == Offset + 1) 14889 Op1 = BV.getOperand(1); 14890 } 14891 BV = BV.getOperand(0); 14892 } 14893 if (!Subtarget->isLittle() && BVSwap) 14894 std::swap(Op0, Op1); 14895 if (Op0 && Op1) 14896 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); 14897 } 14898 14899 return SDValue(); 14900 } 14901 14902 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 14903 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 14904 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 14905 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 14906 SDValue Op0 = N->getOperand(0); 14907 SDValue Op1 = N->getOperand(1); 14908 if (Op0.getOpcode() == ISD::BITCAST) 14909 Op0 = Op0.getOperand(0); 14910 if (Op1.getOpcode() == ISD::BITCAST) 14911 Op1 = Op1.getOperand(0); 14912 if (Op0.getOpcode() == ARMISD::VMOVRRD && 14913 Op0.getNode() == Op1.getNode() && 14914 Op0.getResNo() == 0 && Op1.getResNo() == 1) 14915 return DAG.getNode(ISD::BITCAST, SDLoc(N), 14916 N->getValueType(0), Op0.getOperand(0)); 14917 return SDValue(); 14918 } 14919 14920 static SDValue PerformVMOVhrCombine(SDNode *N, 14921 TargetLowering::DAGCombinerInfo &DCI) { 14922 SDValue Op0 = N->getOperand(0); 14923 14924 // VMOVhr (VMOVrh (X)) -> X 14925 if (Op0->getOpcode() == ARMISD::VMOVrh) 14926 return Op0->getOperand(0); 14927 14928 // FullFP16: half values are passed in S-registers, and we don't 14929 // need any of the bitcast and moves: 14930 // 14931 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 14932 // t5: i32 = bitcast t2 14933 // t18: f16 = ARMISD::VMOVhr t5 14934 if (Op0->getOpcode() == ISD::BITCAST) { 14935 SDValue Copy = Op0->getOperand(0); 14936 if (Copy.getValueType() == MVT::f32 && 14937 Copy->getOpcode() == ISD::CopyFromReg) { 14938 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)}; 14939 SDValue NewCopy = 14940 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops); 14941 return NewCopy; 14942 } 14943 } 14944 14945 // fold (VMOVhr (load x)) -> (load (f16*)x) 14946 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) { 14947 if (LN0->hasOneUse() && LN0->isUnindexed() && 14948 LN0->getMemoryVT() == MVT::i16) { 14949 SDValue Load = 14950 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(), 14951 LN0->getBasePtr(), LN0->getMemOperand()); 14952 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); 14953 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1)); 14954 return Load; 14955 } 14956 } 14957 14958 // Only the bottom 16 bits of the source register are used. 
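  // Informing SimplifyDemandedBits of this below may let it simplify whatever
  // computes Op0.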
14959 APInt DemandedMask = APInt::getLowBitsSet(32, 16); 14960 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 14961 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI)) 14962 return SDValue(N, 0); 14963 14964 return SDValue(); 14965 } 14966 14967 static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) { 14968 SDValue N0 = N->getOperand(0); 14969 EVT VT = N->getValueType(0); 14970 14971 // fold (VMOVrh (fpconst x)) -> const x 14972 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) { 14973 APFloat V = C->getValueAPF(); 14974 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT); 14975 } 14976 14977 // fold (VMOVrh (load x)) -> (zextload (i16*)x) 14978 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { 14979 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 14980 14981 SDValue Load = 14982 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), 14983 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand()); 14984 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); 14985 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); 14986 return Load; 14987 } 14988 14989 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n) 14990 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 14991 isa<ConstantSDNode>(N0->getOperand(1))) 14992 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0), 14993 N0->getOperand(1)); 14994 14995 return SDValue(); 14996 } 14997 14998 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 14999 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 15000 /// i64 vector to have f64 elements, since the value can then be loaded 15001 /// directly into a VFP register. 15002 static bool hasNormalLoadOperand(SDNode *N) { 15003 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 15004 for (unsigned i = 0; i < NumElts; ++i) { 15005 SDNode *Elt = N->getOperand(i).getNode(); 15006 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 15007 return true; 15008 } 15009 return false; 15010 } 15011 15012 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 15013 /// ISD::BUILD_VECTOR. 15014 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 15015 TargetLowering::DAGCombinerInfo &DCI, 15016 const ARMSubtarget *Subtarget) { 15017 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 15018 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 15019 // into a pair of GPRs, which is fine when the value is used as a scalar, 15020 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 15021 SelectionDAG &DAG = DCI.DAG; 15022 if (N->getNumOperands() == 2) 15023 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 15024 return RV; 15025 15026 // Load i64 elements as f64 values so that type legalization does not split 15027 // them up into i32 values. 15028 EVT VT = N->getValueType(0); 15029 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 15030 return SDValue(); 15031 SDLoc dl(N); 15032 SmallVector<SDValue, 8> Ops; 15033 unsigned NumElts = VT.getVectorNumElements(); 15034 for (unsigned i = 0; i < NumElts; ++i) { 15035 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 15036 Ops.push_back(V); 15037 // Make the DAGCombiner fold the bitcast. 
15038 DCI.AddToWorklist(V.getNode()); 15039 } 15040 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 15041 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 15042 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 15043 } 15044 15045 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 15046 static SDValue 15047 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 15048 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. 15049 // At that time, we may have inserted bitcasts from integer to float. 15050 // If these bitcasts have survived DAGCombine, change the lowering of this 15051 // BUILD_VECTOR in something more vector friendly, i.e., that does not 15052 // force to use floating point types. 15053 15054 // Make sure we can change the type of the vector. 15055 // This is possible iff: 15056 // 1. The vector is only used in a bitcast to a integer type. I.e., 15057 // 1.1. Vector is used only once. 15058 // 1.2. Use is a bit convert to an integer type. 15059 // 2. The size of its operands are 32-bits (64-bits are not legal). 15060 EVT VT = N->getValueType(0); 15061 EVT EltVT = VT.getVectorElementType(); 15062 15063 // Check 1.1. and 2. 15064 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) 15065 return SDValue(); 15066 15067 // By construction, the input type must be float. 15068 assert(EltVT == MVT::f32 && "Unexpected type!"); 15069 15070 // Check 1.2. 15071 SDNode *Use = *N->use_begin(); 15072 if (Use->getOpcode() != ISD::BITCAST || 15073 Use->getValueType(0).isFloatingPoint()) 15074 return SDValue(); 15075 15076 // Check profitability. 15077 // Model is, if more than half of the relevant operands are bitcast from 15078 // i32, turn the build_vector into a sequence of insert_vector_elt. 15079 // Relevant operands are everything that is not statically 15080 // (i.e., at compile time) bitcasted. 15081 unsigned NumOfBitCastedElts = 0; 15082 unsigned NumElts = VT.getVectorNumElements(); 15083 unsigned NumOfRelevantElts = NumElts; 15084 for (unsigned Idx = 0; Idx < NumElts; ++Idx) { 15085 SDValue Elt = N->getOperand(Idx); 15086 if (Elt->getOpcode() == ISD::BITCAST) { 15087 // Assume only bit cast to i32 will go away. 15088 if (Elt->getOperand(0).getValueType() == MVT::i32) 15089 ++NumOfBitCastedElts; 15090 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) 15091 // Constants are statically casted, thus do not count them as 15092 // relevant operands. 15093 --NumOfRelevantElts; 15094 } 15095 15096 // Check if more than half of the elements require a non-free bitcast. 15097 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 15098 return SDValue(); 15099 15100 SelectionDAG &DAG = DCI.DAG; 15101 // Create the new vector type. 15102 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 15103 // Check if the type is legal. 15104 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15105 if (!TLI.isTypeLegal(VecVT)) 15106 return SDValue(); 15107 15108 // Combine: 15109 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 15110 // => BITCAST INSERT_VECTOR_ELT 15111 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 15112 // (BITCAST EN), N. 15113 SDValue Vec = DAG.getUNDEF(VecVT); 15114 SDLoc dl(N); 15115 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 15116 SDValue V = N->getOperand(Idx); 15117 if (V.isUndef()) 15118 continue; 15119 if (V.getOpcode() == ISD::BITCAST && 15120 V->getOperand(0).getValueType() == MVT::i32) 15121 // Fold obvious case. 
15122 V = V.getOperand(0); 15123 else { 15124 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 15125 // Make the DAGCombiner fold the bitcasts. 15126 DCI.AddToWorklist(V.getNode()); 15127 } 15128 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 15129 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 15130 } 15131 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 15132 // Make the DAGCombiner fold the bitcasts. 15133 DCI.AddToWorklist(Vec.getNode()); 15134 return Vec; 15135 } 15136 15137 static SDValue 15138 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 15139 EVT VT = N->getValueType(0); 15140 SDValue Op = N->getOperand(0); 15141 SDLoc dl(N); 15142 15143 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) 15144 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { 15145 // If the valuetypes are the same, we can remove the cast entirely. 15146 if (Op->getOperand(0).getValueType() == VT) 15147 return Op->getOperand(0); 15148 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); 15149 } 15150 15151 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce 15152 // more VPNOT which might get folded as else predicates. 15153 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) { 15154 SDValue X = 15155 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); 15156 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 15157 DCI.DAG.getConstant(65535, dl, MVT::i32)); 15158 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C); 15159 } 15160 15161 // Only the bottom 16 bits of the source register are used. 15162 if (Op.getValueType() == MVT::i32) { 15163 APInt DemandedMask = APInt::getLowBitsSet(32, 16); 15164 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 15165 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) 15166 return SDValue(N, 0); 15167 } 15168 return SDValue(); 15169 } 15170 15171 static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, 15172 const ARMSubtarget *ST) { 15173 EVT VT = N->getValueType(0); 15174 SDValue Op = N->getOperand(0); 15175 SDLoc dl(N); 15176 15177 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST 15178 if (ST->isLittle()) 15179 return DAG.getNode(ISD::BITCAST, dl, VT, Op); 15180 15181 // VECTOR_REG_CAST undef -> undef 15182 if (Op.isUndef()) 15183 return DAG.getUNDEF(VT); 15184 15185 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x) 15186 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) { 15187 // If the valuetypes are the same, we can remove the cast entirely. 
15188 if (Op->getOperand(0).getValueType() == VT) 15189 return Op->getOperand(0); 15190 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0)); 15191 } 15192 15193 return SDValue(); 15194 } 15195 15196 static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, 15197 const ARMSubtarget *Subtarget) { 15198 if (!Subtarget->hasMVEIntegerOps()) 15199 return SDValue(); 15200 15201 EVT VT = N->getValueType(0); 15202 SDValue Op0 = N->getOperand(0); 15203 SDValue Op1 = N->getOperand(1); 15204 ARMCC::CondCodes Cond = 15205 (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 15206 SDLoc dl(N); 15207 15208 // vcmp X, 0, cc -> vcmpz X, cc 15209 if (isZeroVector(Op1)) 15210 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2)); 15211 15212 unsigned SwappedCond = getSwappedCondition(Cond); 15213 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { 15214 // vcmp 0, X, cc -> vcmpz X, reversed(cc) 15215 if (isZeroVector(Op0)) 15216 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, 15217 DAG.getConstant(SwappedCond, dl, MVT::i32)); 15218 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) 15219 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) 15220 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, 15221 DAG.getConstant(SwappedCond, dl, MVT::i32)); 15222 } 15223 15224 return SDValue(); 15225 } 15226 15227 /// PerformInsertEltCombine - Target-specific dag combine xforms for 15228 /// ISD::INSERT_VECTOR_ELT. 15229 static SDValue PerformInsertEltCombine(SDNode *N, 15230 TargetLowering::DAGCombinerInfo &DCI) { 15231 // Bitcast an i64 load inserted into a vector to f64. 15232 // Otherwise, the i64 value will be legalized to a pair of i32 values. 15233 EVT VT = N->getValueType(0); 15234 SDNode *Elt = N->getOperand(1).getNode(); 15235 if (VT.getVectorElementType() != MVT::i64 || 15236 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 15237 return SDValue(); 15238 15239 SelectionDAG &DAG = DCI.DAG; 15240 SDLoc dl(N); 15241 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 15242 VT.getVectorNumElements()); 15243 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 15244 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 15245 // Make the DAGCombiner fold the bitcasts. 15246 DCI.AddToWorklist(Vec.getNode()); 15247 DCI.AddToWorklist(V.getNode()); 15248 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 15249 Vec, V, N->getOperand(2)); 15250 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 15251 } 15252 15253 // Convert a pair of extracts from the same base vector to a VMOVRRD. Either 15254 // directly or bitcast to an integer if the original is a float vector. 
15255 // extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2) 15256 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2) 15257 static SDValue 15258 PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 15259 EVT VT = N->getValueType(0); 15260 SDLoc dl(N); 15261 15262 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 || 15263 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64)) 15264 return SDValue(); 15265 15266 SDValue Ext = SDValue(N, 0); 15267 if (Ext.getOpcode() == ISD::BITCAST && 15268 Ext.getOperand(0).getValueType() == MVT::f32) 15269 Ext = Ext.getOperand(0); 15270 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 15271 !isa<ConstantSDNode>(Ext.getOperand(1)) || 15272 Ext.getConstantOperandVal(1) % 2 != 0) 15273 return SDValue(); 15274 if (Ext->use_size() == 1 && 15275 (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP || 15276 Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP)) 15277 return SDValue(); 15278 15279 SDValue Op0 = Ext.getOperand(0); 15280 EVT VecVT = Op0.getValueType(); 15281 unsigned ResNo = Op0.getResNo(); 15282 unsigned Lane = Ext.getConstantOperandVal(1); 15283 if (VecVT.getVectorNumElements() != 4) 15284 return SDValue(); 15285 15286 // Find another extract, of Lane + 1 15287 auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) { 15288 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 15289 isa<ConstantSDNode>(V->getOperand(1)) && 15290 V->getConstantOperandVal(1) == Lane + 1 && 15291 V->getOperand(0).getResNo() == ResNo; 15292 }); 15293 if (OtherIt == Op0->uses().end()) 15294 return SDValue(); 15295 15296 // For float extracts, we need to be converting to a i32 for both vector 15297 // lanes. 15298 SDValue OtherExt(*OtherIt, 0); 15299 if (OtherExt.getValueType() != MVT::i32) { 15300 if (OtherExt->use_size() != 1 || 15301 OtherExt->use_begin()->getOpcode() != ISD::BITCAST || 15302 OtherExt->use_begin()->getValueType(0) != MVT::i32) 15303 return SDValue(); 15304 OtherExt = SDValue(*OtherExt->use_begin(), 0); 15305 } 15306 15307 // Convert the type to a f64 and extract with a VMOVRRD. 
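  // i.e. (illustrative) the two neighbouring lanes n and n+1 are re-extracted
  // as the single f64 lane n/2 of a VECTOR_REG_CAST to v2f64, and VMOVRRD then
  // yields both i32 halves.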
15308 SDValue F64 = DCI.DAG.getNode( 15309 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 15310 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0), 15311 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32)); 15312 SDValue VMOVRRD = 15313 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64); 15314 15315 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1)); 15316 return VMOVRRD; 15317 } 15318 15319 static SDValue PerformExtractEltCombine(SDNode *N, 15320 TargetLowering::DAGCombinerInfo &DCI, 15321 const ARMSubtarget *ST) { 15322 SDValue Op0 = N->getOperand(0); 15323 EVT VT = N->getValueType(0); 15324 SDLoc dl(N); 15325 15326 // extract (vdup x) -> x 15327 if (Op0->getOpcode() == ARMISD::VDUP) { 15328 SDValue X = Op0->getOperand(0); 15329 if (VT == MVT::f16 && X.getValueType() == MVT::i32) 15330 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X); 15331 if (VT == MVT::i32 && X.getValueType() == MVT::f16) 15332 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X); 15333 if (VT == MVT::f32 && X.getValueType() == MVT::i32) 15334 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X); 15335 15336 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST) 15337 X = X->getOperand(0); 15338 if (X.getValueType() == VT) 15339 return X; 15340 } 15341 15342 // extract ARM_BUILD_VECTOR -> x 15343 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR && 15344 isa<ConstantSDNode>(N->getOperand(1)) && 15345 N->getConstantOperandVal(1) < Op0.getNumOperands()) { 15346 return Op0.getOperand(N->getConstantOperandVal(1)); 15347 } 15348 15349 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b 15350 if (Op0.getValueType() == MVT::v4i32 && 15351 isa<ConstantSDNode>(N->getOperand(1)) && 15352 Op0.getOpcode() == ISD::BITCAST && 15353 Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && 15354 Op0.getOperand(0).getValueType() == MVT::v2f64) { 15355 SDValue BV = Op0.getOperand(0); 15356 unsigned Offset = N->getConstantOperandVal(1); 15357 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1); 15358 if (MOV.getOpcode() == ARMISD::VMOVDRR) 15359 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2); 15360 } 15361 15362 // extract x, n; extract x, n+1 -> VMOVRRD x 15363 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) 15364 return R; 15365 15366 // extract (MVETrunc(x)) -> extract x 15367 if (Op0->getOpcode() == ARMISD::MVETRUNC) { 15368 unsigned Idx = N->getConstantOperandVal(1); 15369 unsigned Vec = 15370 Idx / Op0->getOperand(0).getValueType().getVectorNumElements(); 15371 unsigned SubIdx = 15372 Idx % Op0->getOperand(0).getValueType().getVectorNumElements(); 15373 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec), 15374 DCI.DAG.getConstant(SubIdx, dl, MVT::i32)); 15375 } 15376 15377 return SDValue(); 15378 } 15379 15380 static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) { 15381 SDValue Op = N->getOperand(0); 15382 EVT VT = N->getValueType(0); 15383 15384 // sext_inreg(VGETLANEu) -> VGETLANEs 15385 if (Op.getOpcode() == ARMISD::VGETLANEu && 15386 cast<VTSDNode>(N->getOperand(1))->getVT() == 15387 Op.getOperand(0).getValueType().getScalarType()) 15388 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0), 15389 Op.getOperand(1)); 15390 15391 return SDValue(); 15392 } 15393 15394 // When lowering complex nodes that we recognize, like VQDMULH and MULH, we 15395 // can end up with shuffle(binop(shuffle, shuffle)), that can be simplified to 15396 // binop as the shuffles cancel out. 
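// For example (sketch):
//   shuffle(VQDMULH(shuffle(a, m), shuffle(b, m)), m')
// becomes VQDMULH(a, b) when applying m' on top of m gives the identity mask,
// which is what the checks below verify.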
15397 static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) { 15398 EVT VT = N->getValueType(0); 15399 if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT) 15400 return SDValue(); 15401 SDValue Op = N->getOperand(0); 15402 15403 // Looking for binary operators that will have been folded from 15404 // truncates/extends. 15405 switch (Op.getOpcode()) { 15406 case ARMISD::VQDMULH: 15407 case ISD::MULHS: 15408 case ISD::MULHU: 15409 case ISD::ABDS: 15410 case ISD::ABDU: 15411 case ISD::AVGFLOORS: 15412 case ISD::AVGFLOORU: 15413 case ISD::AVGCEILS: 15414 case ISD::AVGCEILU: 15415 break; 15416 default: 15417 return SDValue(); 15418 } 15419 15420 ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0)); 15421 ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1)); 15422 if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() || 15423 !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() || 15424 Op0->getOperand(0).getValueType() != VT) 15425 return SDValue(); 15426 15427 // Check the mask turns into an identity shuffle. 15428 ArrayRef<int> NMask = N->getMask(); 15429 ArrayRef<int> OpMask = Op0->getMask(); 15430 for (int i = 0, e = NMask.size(); i != e; i++) { 15431 if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i) 15432 return SDValue(); 15433 } 15434 15435 return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), 15436 Op0->getOperand(0), Op1->getOperand(0)); 15437 } 15438 15439 static SDValue 15440 PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 15441 SDValue Vec = N->getOperand(0); 15442 SDValue SubVec = N->getOperand(1); 15443 uint64_t IdxVal = N->getConstantOperandVal(2); 15444 EVT VecVT = Vec.getValueType(); 15445 EVT SubVT = SubVec.getValueType(); 15446 15447 // Only do this for legal fixed vector types. 15448 if (!VecVT.isFixedLengthVector() || 15449 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) || 15450 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT)) 15451 return SDValue(); 15452 15453 // Ignore widening patterns. 15454 if (IdxVal == 0 && Vec.isUndef()) 15455 return SDValue(); 15456 15457 // Subvector must be half the width and an "aligned" insertion. 
15458 unsigned NumSubElts = SubVT.getVectorNumElements(); 15459 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() || 15460 (IdxVal != 0 && IdxVal != NumSubElts)) 15461 return SDValue(); 15462 15463 // Fold insert_subvector -> concat_vectors 15464 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi)) 15465 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub) 15466 SDLoc DL(N); 15467 SDValue Lo, Hi; 15468 if (IdxVal == 0) { 15469 Lo = SubVec; 15470 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, 15471 DCI.DAG.getVectorIdxConstant(NumSubElts, DL)); 15472 } else { 15473 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, 15474 DCI.DAG.getVectorIdxConstant(0, DL)); 15475 Hi = SubVec; 15476 } 15477 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi); 15478 } 15479 15480 // shuffle(MVETrunc(x, y)) -> VMOVN(x, y) 15481 static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, 15482 SelectionDAG &DAG) { 15483 SDValue Trunc = N->getOperand(0); 15484 EVT VT = Trunc.getValueType(); 15485 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef()) 15486 return SDValue(); 15487 15488 SDLoc DL(Trunc); 15489 if (isVMOVNTruncMask(N->getMask(), VT, false)) 15490 return DAG.getNode( 15491 ARMISD::VMOVN, DL, VT, 15492 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), 15493 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), 15494 DAG.getConstant(1, DL, MVT::i32)); 15495 else if (isVMOVNTruncMask(N->getMask(), VT, true)) 15496 return DAG.getNode( 15497 ARMISD::VMOVN, DL, VT, 15498 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), 15499 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), 15500 DAG.getConstant(1, DL, MVT::i32)); 15501 return SDValue(); 15502 } 15503 15504 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 15505 /// ISD::VECTOR_SHUFFLE. 15506 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 15507 if (SDValue R = FlattenVectorShuffle(cast<ShuffleVectorSDNode>(N), DAG)) 15508 return R; 15509 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG)) 15510 return R; 15511 15512 // The LLVM shufflevector instruction does not require the shuffle mask 15513 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 15514 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 15515 // operands do not match the mask length, they are extended by concatenating 15516 // them with undef vectors. That is probably the right thing for other 15517 // targets, but for NEON it is better to concatenate two double-register 15518 // size vector operands into a single quad-register size vector. Do that 15519 // transformation here: 15520 // shuffle(concat(v1, undef), concat(v2, undef)) -> 15521 // shuffle(concat(v1, v2), undef) 15522 SDValue Op0 = N->getOperand(0); 15523 SDValue Op1 = N->getOperand(1); 15524 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 15525 Op1.getOpcode() != ISD::CONCAT_VECTORS || 15526 Op0.getNumOperands() != 2 || 15527 Op1.getNumOperands() != 2) 15528 return SDValue(); 15529 SDValue Concat0Op1 = Op0.getOperand(1); 15530 SDValue Concat1Op1 = Op1.getOperand(1); 15531 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 15532 return SDValue(); 15533 // Skip the transformation if any of the types are illegal. 
15534 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15535 EVT VT = N->getValueType(0); 15536 if (!TLI.isTypeLegal(VT) || 15537 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 15538 !TLI.isTypeLegal(Concat1Op1.getValueType())) 15539 return SDValue(); 15540 15541 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 15542 Op0.getOperand(0), Op1.getOperand(0)); 15543 // Translate the shuffle mask. 15544 SmallVector<int, 16> NewMask; 15545 unsigned NumElts = VT.getVectorNumElements(); 15546 unsigned HalfElts = NumElts/2; 15547 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 15548 for (unsigned n = 0; n < NumElts; ++n) { 15549 int MaskElt = SVN->getMaskElt(n); 15550 int NewElt = -1; 15551 if (MaskElt < (int)HalfElts) 15552 NewElt = MaskElt; 15553 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 15554 NewElt = HalfElts + MaskElt - NumElts; 15555 NewMask.push_back(NewElt); 15556 } 15557 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 15558 DAG.getUNDEF(VT), NewMask); 15559 } 15560 15561 /// Load/store instruction that can be merged with a base address 15562 /// update 15563 struct BaseUpdateTarget { 15564 SDNode *N; 15565 bool isIntrinsic; 15566 bool isStore; 15567 unsigned AddrOpIdx; 15568 }; 15569 15570 struct BaseUpdateUser { 15571 /// Instruction that updates a pointer 15572 SDNode *N; 15573 /// Pointer increment operand 15574 SDValue Inc; 15575 /// Pointer increment value if it is a constant, or 0 otherwise 15576 unsigned ConstInc; 15577 }; 15578 15579 static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, 15580 struct BaseUpdateUser &User, 15581 bool SimpleConstIncOnly, 15582 TargetLowering::DAGCombinerInfo &DCI) { 15583 SelectionDAG &DAG = DCI.DAG; 15584 SDNode *N = Target.N; 15585 MemSDNode *MemN = cast<MemSDNode>(N); 15586 SDLoc dl(N); 15587 15588 // Find the new opcode for the updating load/store. 15589 bool isLoadOp = true; 15590 bool isLaneOp = false; 15591 // Workaround for vst1x and vld1x intrinsics which do not have alignment 15592 // as an operand. 
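  // (hasAlignment is cleared for those intrinsics below; later it decides
  // whether the node's last operand is an alignment immediate that should be
  // dropped when the new operand list is built.)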
15593 bool hasAlignment = true; 15594 unsigned NewOpc = 0; 15595 unsigned NumVecs = 0; 15596 if (Target.isIntrinsic) { 15597 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 15598 switch (IntNo) { 15599 default: 15600 llvm_unreachable("unexpected intrinsic for Neon base update"); 15601 case Intrinsic::arm_neon_vld1: 15602 NewOpc = ARMISD::VLD1_UPD; 15603 NumVecs = 1; 15604 break; 15605 case Intrinsic::arm_neon_vld2: 15606 NewOpc = ARMISD::VLD2_UPD; 15607 NumVecs = 2; 15608 break; 15609 case Intrinsic::arm_neon_vld3: 15610 NewOpc = ARMISD::VLD3_UPD; 15611 NumVecs = 3; 15612 break; 15613 case Intrinsic::arm_neon_vld4: 15614 NewOpc = ARMISD::VLD4_UPD; 15615 NumVecs = 4; 15616 break; 15617 case Intrinsic::arm_neon_vld1x2: 15618 NewOpc = ARMISD::VLD1x2_UPD; 15619 NumVecs = 2; 15620 hasAlignment = false; 15621 break; 15622 case Intrinsic::arm_neon_vld1x3: 15623 NewOpc = ARMISD::VLD1x3_UPD; 15624 NumVecs = 3; 15625 hasAlignment = false; 15626 break; 15627 case Intrinsic::arm_neon_vld1x4: 15628 NewOpc = ARMISD::VLD1x4_UPD; 15629 NumVecs = 4; 15630 hasAlignment = false; 15631 break; 15632 case Intrinsic::arm_neon_vld2dup: 15633 NewOpc = ARMISD::VLD2DUP_UPD; 15634 NumVecs = 2; 15635 break; 15636 case Intrinsic::arm_neon_vld3dup: 15637 NewOpc = ARMISD::VLD3DUP_UPD; 15638 NumVecs = 3; 15639 break; 15640 case Intrinsic::arm_neon_vld4dup: 15641 NewOpc = ARMISD::VLD4DUP_UPD; 15642 NumVecs = 4; 15643 break; 15644 case Intrinsic::arm_neon_vld2lane: 15645 NewOpc = ARMISD::VLD2LN_UPD; 15646 NumVecs = 2; 15647 isLaneOp = true; 15648 break; 15649 case Intrinsic::arm_neon_vld3lane: 15650 NewOpc = ARMISD::VLD3LN_UPD; 15651 NumVecs = 3; 15652 isLaneOp = true; 15653 break; 15654 case Intrinsic::arm_neon_vld4lane: 15655 NewOpc = ARMISD::VLD4LN_UPD; 15656 NumVecs = 4; 15657 isLaneOp = true; 15658 break; 15659 case Intrinsic::arm_neon_vst1: 15660 NewOpc = ARMISD::VST1_UPD; 15661 NumVecs = 1; 15662 isLoadOp = false; 15663 break; 15664 case Intrinsic::arm_neon_vst2: 15665 NewOpc = ARMISD::VST2_UPD; 15666 NumVecs = 2; 15667 isLoadOp = false; 15668 break; 15669 case Intrinsic::arm_neon_vst3: 15670 NewOpc = ARMISD::VST3_UPD; 15671 NumVecs = 3; 15672 isLoadOp = false; 15673 break; 15674 case Intrinsic::arm_neon_vst4: 15675 NewOpc = ARMISD::VST4_UPD; 15676 NumVecs = 4; 15677 isLoadOp = false; 15678 break; 15679 case Intrinsic::arm_neon_vst2lane: 15680 NewOpc = ARMISD::VST2LN_UPD; 15681 NumVecs = 2; 15682 isLoadOp = false; 15683 isLaneOp = true; 15684 break; 15685 case Intrinsic::arm_neon_vst3lane: 15686 NewOpc = ARMISD::VST3LN_UPD; 15687 NumVecs = 3; 15688 isLoadOp = false; 15689 isLaneOp = true; 15690 break; 15691 case Intrinsic::arm_neon_vst4lane: 15692 NewOpc = ARMISD::VST4LN_UPD; 15693 NumVecs = 4; 15694 isLoadOp = false; 15695 isLaneOp = true; 15696 break; 15697 case Intrinsic::arm_neon_vst1x2: 15698 NewOpc = ARMISD::VST1x2_UPD; 15699 NumVecs = 2; 15700 isLoadOp = false; 15701 hasAlignment = false; 15702 break; 15703 case Intrinsic::arm_neon_vst1x3: 15704 NewOpc = ARMISD::VST1x3_UPD; 15705 NumVecs = 3; 15706 isLoadOp = false; 15707 hasAlignment = false; 15708 break; 15709 case Intrinsic::arm_neon_vst1x4: 15710 NewOpc = ARMISD::VST1x4_UPD; 15711 NumVecs = 4; 15712 isLoadOp = false; 15713 hasAlignment = false; 15714 break; 15715 } 15716 } else { 15717 isLaneOp = true; 15718 switch (N->getOpcode()) { 15719 default: 15720 llvm_unreachable("unexpected opcode for Neon base update"); 15721 case ARMISD::VLD1DUP: 15722 NewOpc = ARMISD::VLD1DUP_UPD; 15723 NumVecs = 1; 15724 break; 15725 case 
ARMISD::VLD2DUP: 15726 NewOpc = ARMISD::VLD2DUP_UPD; 15727 NumVecs = 2; 15728 break; 15729 case ARMISD::VLD3DUP: 15730 NewOpc = ARMISD::VLD3DUP_UPD; 15731 NumVecs = 3; 15732 break; 15733 case ARMISD::VLD4DUP: 15734 NewOpc = ARMISD::VLD4DUP_UPD; 15735 NumVecs = 4; 15736 break; 15737 case ISD::LOAD: 15738 NewOpc = ARMISD::VLD1_UPD; 15739 NumVecs = 1; 15740 isLaneOp = false; 15741 break; 15742 case ISD::STORE: 15743 NewOpc = ARMISD::VST1_UPD; 15744 NumVecs = 1; 15745 isLaneOp = false; 15746 isLoadOp = false; 15747 break; 15748 } 15749 } 15750 15751 // Find the size of memory referenced by the load/store. 15752 EVT VecTy; 15753 if (isLoadOp) { 15754 VecTy = N->getValueType(0); 15755 } else if (Target.isIntrinsic) { 15756 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType(); 15757 } else { 15758 assert(Target.isStore && 15759 "Node has to be a load, a store, or an intrinsic!"); 15760 VecTy = N->getOperand(1).getValueType(); 15761 } 15762 15763 bool isVLDDUPOp = 15764 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD || 15765 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD; 15766 15767 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 15768 if (isLaneOp || isVLDDUPOp) 15769 NumBytes /= VecTy.getVectorNumElements(); 15770 15771 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) { 15772 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 15773 // separate instructions that make it harder to use a non-constant update. 15774 return false; 15775 } 15776 15777 if (SimpleConstIncOnly && User.ConstInc != NumBytes) 15778 return false; 15779 15780 // OK, we found an ADD we can fold into the base update. 15781 // Now, create a _UPD node, taking care of not breaking alignment. 15782 15783 EVT AlignedVecTy = VecTy; 15784 Align Alignment = MemN->getAlign(); 15785 15786 // If this is a less-than-standard-aligned load/store, change the type to 15787 // match the standard alignment. 15788 // The alignment is overlooked when selecting _UPD variants; and it's 15789 // easier to introduce bitcasts here than fix that. 15790 // There are 3 ways to get to this base-update combine: 15791 // - intrinsics: they are assumed to be properly aligned (to the standard 15792 // alignment of the memory type), so we don't need to do anything. 15793 // - ARMISD::VLDx nodes: they are only generated from the aforementioned 15794 // intrinsics, so, likewise, there's nothing to do. 15795 // - generic load/store instructions: the alignment is specified as an 15796 // explicit operand, rather than implicitly as the standard alignment 15797 // of the memory type (like the intrisics). We need to change the 15798 // memory type to match the explicit alignment. That way, we don't 15799 // generate non-standard-aligned ARMISD::VLDx nodes. 15800 if (isa<LSBaseSDNode>(N)) { 15801 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) { 15802 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8); 15803 assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); 15804 assert(!isLaneOp && "Unexpected generic load/store lane."); 15805 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); 15806 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); 15807 } 15808 // Don't set an explicit alignment on regular load/stores that we want 15809 // to transform to VLD/VST 1_UPD nodes. 15810 // This matches the behavior of regular load/stores, which only get an 15811 // explicit alignment if the MMO alignment is larger than the standard 15812 // alignment of the memory type. 
    // Intrinsics, however, always get an explicit alignment, set to the
    // alignment of the MMO.
    Alignment = Align(1);
  }

  // Create the new updating load/store node.
  // First, create an SDVTList for the new updating node's results.
  EVT Tys[6];
  unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
  unsigned n;
  for (n = 0; n < NumResultVecs; ++n)
    Tys[n] = AlignedVecTy;
  Tys[n++] = MVT::i32;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

  // Then, gather the new node's operands.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(N->getOperand(0)); // incoming chain
  Ops.push_back(N->getOperand(Target.AddrOpIdx));
  Ops.push_back(User.Inc);

  if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
    // Try to match the intrinsic's signature
    Ops.push_back(StN->getValue());
  } else {
    // Loads (and of course intrinsics) match the intrinsics' signature,
    // so just add all but the alignment operand.
    unsigned LastOperand =
        hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
    for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
      Ops.push_back(N->getOperand(i));
  }

  // For all node types, the alignment operand is always the last one.
  Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));

  // If this is a non-standard-aligned STORE, the penultimate operand is the
  // stored value. Bitcast it to the aligned type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
    SDValue &StVal = Ops[Ops.size() - 2];
    StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
  }

  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                         MemN->getMemOperand());

  // Update the uses.
  SmallVector<SDValue, 5> NewResults;
  for (unsigned i = 0; i < NumResultVecs; ++i)
    NewResults.push_back(SDValue(UpdN.getNode(), i));

  // If this is a non-standard-aligned LOAD, the first result is the loaded
  // value. Bitcast it to the expected result type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
    SDValue &LdVal = NewResults[0];
    LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
  }

  NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
  DCI.CombineTo(N, NewResults);
  DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));

  return true;
}

// If (opcode ptr inc) is an ADD-like instruction, return the
// increment value. Otherwise return 0.
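// For example (illustrative): given a constant increment of 16, ADD (and
// VLD1_UPD) return 16; OR returns 16 only when haveNoCommonBitsSet shows it
// acts as an ADD; anything else returns 0.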
15882 static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, 15883 SDValue Inc, const SelectionDAG &DAG) { 15884 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 15885 if (!CInc) 15886 return 0; 15887 15888 switch (Opcode) { 15889 case ARMISD::VLD1_UPD: 15890 case ISD::ADD: 15891 return CInc->getZExtValue(); 15892 case ISD::OR: { 15893 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) { 15894 // (OR ptr inc) is the same as (ADD ptr inc) 15895 return CInc->getZExtValue(); 15896 } 15897 return 0; 15898 } 15899 default: 15900 return 0; 15901 } 15902 } 15903 15904 static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) { 15905 switch (N->getOpcode()) { 15906 case ISD::ADD: 15907 case ISD::OR: { 15908 if (isa<ConstantSDNode>(N->getOperand(1))) { 15909 *Ptr = N->getOperand(0); 15910 *CInc = N->getOperand(1); 15911 return true; 15912 } 15913 return false; 15914 } 15915 case ARMISD::VLD1_UPD: { 15916 if (isa<ConstantSDNode>(N->getOperand(2))) { 15917 *Ptr = N->getOperand(1); 15918 *CInc = N->getOperand(2); 15919 return true; 15920 } 15921 return false; 15922 } 15923 default: 15924 return false; 15925 } 15926 } 15927 15928 static bool isValidBaseUpdate(SDNode *N, SDNode *User) { 15929 // Check that the add is independent of the load/store. 15930 // Otherwise, folding it would create a cycle. Search through Addr 15931 // as well, since the User may not be a direct user of Addr and 15932 // only share a base pointer. 15933 SmallPtrSet<const SDNode *, 32> Visited; 15934 SmallVector<const SDNode *, 16> Worklist; 15935 Worklist.push_back(N); 15936 Worklist.push_back(User); 15937 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 15938 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 15939 return false; 15940 return true; 15941 } 15942 15943 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 15944 /// NEON load/store intrinsics, and generic vector load/stores, to merge 15945 /// base address updates. 15946 /// For generic load/stores, the memory type is assumed to be a vector. 15947 /// The caller is assumed to have checked legality. 15948 static SDValue CombineBaseUpdate(SDNode *N, 15949 TargetLowering::DAGCombinerInfo &DCI) { 15950 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 15951 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 15952 const bool isStore = N->getOpcode() == ISD::STORE; 15953 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 15954 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx}; 15955 15956 SDValue Addr = N->getOperand(AddrOpIdx); 15957 15958 SmallVector<BaseUpdateUser, 8> BaseUpdates; 15959 15960 // Search for a use of the address operand that is an increment. 15961 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 15962 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 15963 SDNode *User = *UI; 15964 if (UI.getUse().getResNo() != Addr.getResNo() || 15965 User->getNumOperands() != 2) 15966 continue; 15967 15968 SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 
0 : 1); 15969 unsigned ConstInc = 15970 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG); 15971 15972 if (ConstInc || User->getOpcode() == ISD::ADD) 15973 BaseUpdates.push_back({User, Inc, ConstInc}); 15974 } 15975 15976 // If the address is a constant pointer increment itself, find 15977 // another constant increment that has the same base operand 15978 SDValue Base; 15979 SDValue CInc; 15980 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) { 15981 unsigned Offset = 15982 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG); 15983 for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end(); 15984 UI != UE; ++UI) { 15985 15986 SDNode *User = *UI; 15987 if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() || 15988 User->getNumOperands() != 2) 15989 continue; 15990 15991 SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0); 15992 unsigned UserOffset = 15993 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG); 15994 15995 if (!UserOffset || UserOffset <= Offset) 15996 continue; 15997 15998 unsigned NewConstInc = UserOffset - Offset; 15999 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32); 16000 BaseUpdates.push_back({User, NewInc, NewConstInc}); 16001 } 16002 } 16003 16004 // Try to fold the load/store with an update that matches memory 16005 // access size. This should work well for sequential loads. 16006 // 16007 // Filter out invalid updates as well. 16008 unsigned NumValidUpd = BaseUpdates.size(); 16009 for (unsigned I = 0; I < NumValidUpd;) { 16010 BaseUpdateUser &User = BaseUpdates[I]; 16011 if (!isValidBaseUpdate(N, User.N)) { 16012 --NumValidUpd; 16013 std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]); 16014 continue; 16015 } 16016 16017 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI)) 16018 return SDValue(); 16019 ++I; 16020 } 16021 BaseUpdates.resize(NumValidUpd); 16022 16023 // Try to fold with other users. Non-constant updates are considered 16024 // first, and constant updates are sorted to not break a sequence of 16025 // strided accesses (if there is any). 16026 std::stable_sort(BaseUpdates.begin(), BaseUpdates.end(), 16027 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) { 16028 return LHS.ConstInc < RHS.ConstInc; 16029 }); 16030 for (BaseUpdateUser &User : BaseUpdates) { 16031 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI)) 16032 return SDValue(); 16033 } 16034 return SDValue(); 16035 } 16036 16037 static SDValue PerformVLDCombine(SDNode *N, 16038 TargetLowering::DAGCombinerInfo &DCI) { 16039 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 16040 return SDValue(); 16041 16042 return CombineBaseUpdate(N, DCI); 16043 } 16044 16045 static SDValue PerformMVEVLDCombine(SDNode *N, 16046 TargetLowering::DAGCombinerInfo &DCI) { 16047 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 16048 return SDValue(); 16049 16050 SelectionDAG &DAG = DCI.DAG; 16051 SDValue Addr = N->getOperand(2); 16052 MemSDNode *MemN = cast<MemSDNode>(N); 16053 SDLoc dl(N); 16054 16055 // For the stores, where there are multiple intrinsics we only actually want 16056 // to post-inc the last of the them. 
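  // (Which part of the multi-part store a given intrinsic writes is
  // identified by its trailing constant operand, checked just below.)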
16057 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 16058 if (IntNo == Intrinsic::arm_mve_vst2q && 16059 cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1) 16060 return SDValue(); 16061 if (IntNo == Intrinsic::arm_mve_vst4q && 16062 cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3) 16063 return SDValue(); 16064 16065 // Search for a use of the address operand that is an increment. 16066 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 16067 UE = Addr.getNode()->use_end(); 16068 UI != UE; ++UI) { 16069 SDNode *User = *UI; 16070 if (User->getOpcode() != ISD::ADD || 16071 UI.getUse().getResNo() != Addr.getResNo()) 16072 continue; 16073 16074 // Check that the add is independent of the load/store. Otherwise, folding 16075 // it would create a cycle. We can avoid searching through Addr as it's a 16076 // predecessor to both. 16077 SmallPtrSet<const SDNode *, 32> Visited; 16078 SmallVector<const SDNode *, 16> Worklist; 16079 Visited.insert(Addr.getNode()); 16080 Worklist.push_back(N); 16081 Worklist.push_back(User); 16082 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 16083 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 16084 continue; 16085 16086 // Find the new opcode for the updating load/store. 16087 bool isLoadOp = true; 16088 unsigned NewOpc = 0; 16089 unsigned NumVecs = 0; 16090 switch (IntNo) { 16091 default: 16092 llvm_unreachable("unexpected intrinsic for MVE VLDn combine"); 16093 case Intrinsic::arm_mve_vld2q: 16094 NewOpc = ARMISD::VLD2_UPD; 16095 NumVecs = 2; 16096 break; 16097 case Intrinsic::arm_mve_vld4q: 16098 NewOpc = ARMISD::VLD4_UPD; 16099 NumVecs = 4; 16100 break; 16101 case Intrinsic::arm_mve_vst2q: 16102 NewOpc = ARMISD::VST2_UPD; 16103 NumVecs = 2; 16104 isLoadOp = false; 16105 break; 16106 case Intrinsic::arm_mve_vst4q: 16107 NewOpc = ARMISD::VST4_UPD; 16108 NumVecs = 4; 16109 isLoadOp = false; 16110 break; 16111 } 16112 16113 // Find the size of memory referenced by the load/store. 16114 EVT VecTy; 16115 if (isLoadOp) { 16116 VecTy = N->getValueType(0); 16117 } else { 16118 VecTy = N->getOperand(3).getValueType(); 16119 } 16120 16121 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 16122 16123 // If the increment is a constant, it must match the memory ref size. 16124 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 16125 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 16126 if (!CInc || CInc->getZExtValue() != NumBytes) 16127 continue; 16128 16129 // Create the new updating load/store node. 16130 // First, create an SDVTList for the new updating node's results. 16131 EVT Tys[6]; 16132 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 16133 unsigned n; 16134 for (n = 0; n < NumResultVecs; ++n) 16135 Tys[n] = VecTy; 16136 Tys[n++] = MVT::i32; 16137 Tys[n] = MVT::Other; 16138 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 16139 16140 // Then, gather the new node's operands. 16141 SmallVector<SDValue, 8> Ops; 16142 Ops.push_back(N->getOperand(0)); // incoming chain 16143 Ops.push_back(N->getOperand(2)); // ptr 16144 Ops.push_back(Inc); 16145 16146 for (unsigned i = 3; i < N->getNumOperands(); ++i) 16147 Ops.push_back(N->getOperand(i)); 16148 16149 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy, 16150 MemN->getMemOperand()); 16151 16152 // Update the uses. 
16153 SmallVector<SDValue, 5> NewResults; 16154 for (unsigned i = 0; i < NumResultVecs; ++i) 16155 NewResults.push_back(SDValue(UpdN.getNode(), i)); 16156 16157 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain 16158 DCI.CombineTo(N, NewResults); 16159 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 16160 16161 break; 16162 } 16163 16164 return SDValue(); 16165 } 16166 16167 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a 16168 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic 16169 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and 16170 /// return true. 16171 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 16172 SelectionDAG &DAG = DCI.DAG; 16173 EVT VT = N->getValueType(0); 16174 // vldN-dup instructions only support 64-bit vectors for N > 1. 16175 if (!VT.is64BitVector()) 16176 return false; 16177 16178 // Check if the VDUPLANE operand is a vldN-dup intrinsic. 16179 SDNode *VLD = N->getOperand(0).getNode(); 16180 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) 16181 return false; 16182 unsigned NumVecs = 0; 16183 unsigned NewOpc = 0; 16184 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); 16185 if (IntNo == Intrinsic::arm_neon_vld2lane) { 16186 NumVecs = 2; 16187 NewOpc = ARMISD::VLD2DUP; 16188 } else if (IntNo == Intrinsic::arm_neon_vld3lane) { 16189 NumVecs = 3; 16190 NewOpc = ARMISD::VLD3DUP; 16191 } else if (IntNo == Intrinsic::arm_neon_vld4lane) { 16192 NumVecs = 4; 16193 NewOpc = ARMISD::VLD4DUP; 16194 } else { 16195 return false; 16196 } 16197 16198 // First check that all the vldN-lane uses are VDUPLANEs and that the lane 16199 // numbers match the load. 16200 unsigned VLDLaneNo = 16201 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); 16202 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 16203 UI != UE; ++UI) { 16204 // Ignore uses of the chain result. 16205 if (UI.getUse().getResNo() == NumVecs) 16206 continue; 16207 SDNode *User = *UI; 16208 if (User->getOpcode() != ARMISD::VDUPLANE || 16209 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 16210 return false; 16211 } 16212 16213 // Create the vldN-dup node. 16214 EVT Tys[5]; 16215 unsigned n; 16216 for (n = 0; n < NumVecs; ++n) 16217 Tys[n] = VT; 16218 Tys[n] = MVT::Other; 16219 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 16220 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 16221 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 16222 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 16223 Ops, VLDMemInt->getMemoryVT(), 16224 VLDMemInt->getMemOperand()); 16225 16226 // Update the uses. 16227 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 16228 UI != UE; ++UI) { 16229 unsigned ResNo = UI.getUse().getResNo(); 16230 // Ignore uses of the chain result. 16231 if (ResNo == NumVecs) 16232 continue; 16233 SDNode *User = *UI; 16234 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 16235 } 16236 16237 // Now the vldN-lane intrinsic is dead except for its chain result. 16238 // Update uses of the chain. 
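  // Illustrative mapping for NumVecs == 2: the old vld2lane node produces
  // {vec0, vec1, chain} and the new VLD2DUP produces a matching list, so the
  // CombineTo below redirects all of the old node's results at once; its
  // vector uses were already rewired above, leaving only the chain.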
16239 std::vector<SDValue> VLDDupResults; 16240 for (unsigned n = 0; n < NumVecs; ++n) 16241 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 16242 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 16243 DCI.CombineTo(VLD, VLDDupResults); 16244 16245 return true; 16246 } 16247 16248 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 16249 /// ARMISD::VDUPLANE. 16250 static SDValue PerformVDUPLANECombine(SDNode *N, 16251 TargetLowering::DAGCombinerInfo &DCI, 16252 const ARMSubtarget *Subtarget) { 16253 SDValue Op = N->getOperand(0); 16254 EVT VT = N->getValueType(0); 16255 16256 // On MVE, we just convert the VDUPLANE to a VDUP with an extract. 16257 if (Subtarget->hasMVEIntegerOps()) { 16258 EVT ExtractVT = VT.getVectorElementType(); 16259 // We need to ensure we are creating a legal type. 16260 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) 16261 ExtractVT = MVT::i32; 16262 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT, 16263 N->getOperand(0), N->getOperand(1)); 16264 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract); 16265 } 16266 16267 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 16268 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 16269 if (CombineVLDDUP(N, DCI)) 16270 return SDValue(N, 0); 16271 16272 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 16273 // redundant. Ignore bit_converts for now; element sizes are checked below. 16274 while (Op.getOpcode() == ISD::BITCAST) 16275 Op = Op.getOperand(0); 16276 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 16277 return SDValue(); 16278 16279 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 16280 unsigned EltSize = Op.getScalarValueSizeInBits(); 16281 // The canonical VMOV for a zero vector uses a 32-bit element size. 16282 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 16283 unsigned EltBits; 16284 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) 16285 EltSize = 8; 16286 if (EltSize > VT.getScalarSizeInBits()) 16287 return SDValue(); 16288 16289 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 16290 } 16291 16292 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 16293 static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, 16294 const ARMSubtarget *Subtarget) { 16295 SDValue Op = N->getOperand(0); 16296 SDLoc dl(N); 16297 16298 if (Subtarget->hasMVEIntegerOps()) { 16299 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will 16300 // need to come from a GPR. 16301 if (Op.getValueType() == MVT::f32) 16302 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), 16303 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op)); 16304 else if (Op.getValueType() == MVT::f16) 16305 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), 16306 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op)); 16307 } 16308 16309 if (!Subtarget->hasNEON()) 16310 return SDValue(); 16311 16312 // Match VDUP(LOAD) -> VLD1DUP. 16313 // We match this pattern here rather than waiting for isel because the 16314 // transform is only legal for unindexed loads. 
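  // For example (illustrative; exact codegen depends on types and alignment):
  //   t0: i16 = load %p
  //   t1: v4i16 = ARMISD::VDUP t0
  // becomes a single VLD1DUP node, i.e. "vld1.16 {d0[]}, [r0]", with the
  // original load's chain rewired to the new node's chain result.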
16315 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 16316 if (LD && Op.hasOneUse() && LD->isUnindexed() && 16317 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 16318 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1), 16319 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)}; 16320 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 16321 SDValue VLDDup = 16322 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops, 16323 LD->getMemoryVT(), LD->getMemOperand()); 16324 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 16325 return VLDDup; 16326 } 16327 16328 return SDValue(); 16329 } 16330 16331 static SDValue PerformLOADCombine(SDNode *N, 16332 TargetLowering::DAGCombinerInfo &DCI, 16333 const ARMSubtarget *Subtarget) { 16334 EVT VT = N->getValueType(0); 16335 16336 // If this is a legal vector load, try to combine it into a VLD1_UPD. 16337 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() && 16338 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 16339 return CombineBaseUpdate(N, DCI); 16340 16341 return SDValue(); 16342 } 16343 16344 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 16345 // pack all of the elements in one place. Next, store to memory in fewer 16346 // chunks. 16347 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, 16348 SelectionDAG &DAG) { 16349 SDValue StVal = St->getValue(); 16350 EVT VT = StVal.getValueType(); 16351 if (!St->isTruncatingStore() || !VT.isVector()) 16352 return SDValue(); 16353 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16354 EVT StVT = St->getMemoryVT(); 16355 unsigned NumElems = VT.getVectorNumElements(); 16356 assert(StVT != VT && "Cannot truncate to the same type"); 16357 unsigned FromEltSz = VT.getScalarSizeInBits(); 16358 unsigned ToEltSz = StVT.getScalarSizeInBits(); 16359 16360 // From, To sizes and ElemCount must be pow of two 16361 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) 16362 return SDValue(); 16363 16364 // We are going to use the original vector elt for storing. 16365 // Accumulated smaller vector elements must be a multiple of the store size. 16366 if (0 != (NumElems * FromEltSz) % ToEltSz) 16367 return SDValue(); 16368 16369 unsigned SizeRatio = FromEltSz / ToEltSz; 16370 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 16371 16372 // Create a type on which we perform the shuffle. 16373 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 16374 NumElems * SizeRatio); 16375 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 16376 16377 SDLoc DL(St); 16378 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 16379 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 16380 for (unsigned i = 0; i < NumElems; ++i) 16381 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1 16382 : i * SizeRatio; 16383 16384 // Can't shuffle using an illegal type. 16385 if (!TLI.isTypeLegal(WideVecVT)) 16386 return SDValue(); 16387 16388 SDValue Shuff = DAG.getVectorShuffle( 16389 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); 16390 // At this point all of the data is stored at the bottom of the 16391 // register. We now need to save it to mem. 
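  // Illustrative example (little-endian): for a v4i32 -> v4i16 truncating
  // store, WideVecVT is v8i16 and ShuffleVec is <0, 2, 4, 6, -1, -1, -1, -1>,
  // so the four narrowed elements land in lanes 0-3 and the code below can
  // write them out with one or two plain integer stores.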
16392 16393 // Find the largest store unit 16394 MVT StoreType = MVT::i8; 16395 for (MVT Tp : MVT::integer_valuetypes()) { 16396 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 16397 StoreType = Tp; 16398 } 16399 // Didn't find a legal store type. 16400 if (!TLI.isTypeLegal(StoreType)) 16401 return SDValue(); 16402 16403 // Bitcast the original vector into a vector of store-size units 16404 EVT StoreVecVT = 16405 EVT::getVectorVT(*DAG.getContext(), StoreType, 16406 VT.getSizeInBits() / EVT(StoreType).getSizeInBits()); 16407 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 16408 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 16409 SmallVector<SDValue, 8> Chains; 16410 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 16411 TLI.getPointerTy(DAG.getDataLayout())); 16412 SDValue BasePtr = St->getBasePtr(); 16413 16414 // Perform one or more big stores into memory. 16415 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits(); 16416 for (unsigned I = 0; I < E; I++) { 16417 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType, 16418 ShuffWide, DAG.getIntPtrConstant(I, DL)); 16419 SDValue Ch = 16420 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), 16421 St->getAlign(), St->getMemOperand()->getFlags()); 16422 BasePtr = 16423 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); 16424 Chains.push_back(Ch); 16425 } 16426 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 16427 } 16428 16429 // Try taking a single vector store from an fpround (which would otherwise turn 16430 // into an expensive buildvector) and splitting it into a series of narrowing 16431 // stores. 16432 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, 16433 SelectionDAG &DAG) { 16434 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 16435 return SDValue(); 16436 SDValue Trunc = St->getValue(); 16437 if (Trunc->getOpcode() != ISD::FP_ROUND) 16438 return SDValue(); 16439 EVT FromVT = Trunc->getOperand(0).getValueType(); 16440 EVT ToVT = Trunc.getValueType(); 16441 if (!ToVT.isVector()) 16442 return SDValue(); 16443 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 16444 EVT ToEltVT = ToVT.getVectorElementType(); 16445 EVT FromEltVT = FromVT.getVectorElementType(); 16446 16447 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16) 16448 return SDValue(); 16449 16450 unsigned NumElements = 4; 16451 if (FromVT.getVectorNumElements() % NumElements != 0) 16452 return SDValue(); 16453 16454 // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so 16455 // use the VMOVN over splitting the store. We are looking for patterns of: 16456 // !rev: 0 N 1 N+1 2 N+2 ... 16457 // rev: N 0 N+1 1 N+2 2 ... 16458 // The shuffle may either be a single source (in which case N = NumElts/2) or 16459 // two inputs extended with concat to the same size (in which case N = 16460 // NumElts). 16461 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) { 16462 ArrayRef<int> M = SVN->getMask(); 16463 unsigned NumElts = ToVT.getVectorNumElements(); 16464 if (SVN->getOperand(1).isUndef()) 16465 NumElts /= 2; 16466 16467 unsigned Off0 = Rev ? NumElts : 0; 16468 unsigned Off1 = Rev ? 
0 : NumElts; 16469 16470 for (unsigned I = 0; I < NumElts; I += 2) { 16471 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2)) 16472 return false; 16473 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2)) 16474 return false; 16475 } 16476 16477 return true; 16478 }; 16479 16480 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0))) 16481 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true)) 16482 return SDValue(); 16483 16484 LLVMContext &C = *DAG.getContext(); 16485 SDLoc DL(St); 16486 // Details about the old store 16487 SDValue Ch = St->getChain(); 16488 SDValue BasePtr = St->getBasePtr(); 16489 Align Alignment = St->getOriginalAlign(); 16490 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 16491 AAMDNodes AAInfo = St->getAAInfo(); 16492 16493 // We split the store into slices of NumElements. fp16 trunc stores are vcvt 16494 // and then stored as truncating integer stores. 16495 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements); 16496 EVT NewToVT = EVT::getVectorVT( 16497 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements); 16498 16499 SmallVector<SDValue, 4> Stores; 16500 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 16501 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; 16502 SDValue NewPtr = 16503 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 16504 16505 SDValue Extract = 16506 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), 16507 DAG.getConstant(i * NumElements, DL, MVT::i32)); 16508 16509 SDValue FPTrunc = 16510 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16), 16511 Extract, DAG.getConstant(0, DL, MVT::i32)); 16512 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc); 16513 16514 SDValue Store = DAG.getTruncStore( 16515 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), 16516 NewToVT, Alignment.value(), MMOFlags, AAInfo); 16517 Stores.push_back(Store); 16518 } 16519 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 16520 } 16521 16522 // Try taking a single vector store from an MVETRUNC (which would otherwise turn 16523 // into an expensive buildvector) and splitting it into a series of narrowing 16524 // stores. 
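// For example (illustrative), a store of (MVETRUNC v8i16:%a, v8i16:%b) with a
// v16i8 result is split into two v8i16 -> v8i8 truncating stores of %a and %b
// at byte offsets 0 and 8, each of which can be selected to a single vstrb.16.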
16525 static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, 16526 SelectionDAG &DAG) { 16527 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 16528 return SDValue(); 16529 SDValue Trunc = St->getValue(); 16530 if (Trunc->getOpcode() != ARMISD::MVETRUNC) 16531 return SDValue(); 16532 EVT FromVT = Trunc->getOperand(0).getValueType(); 16533 EVT ToVT = Trunc.getValueType(); 16534 16535 LLVMContext &C = *DAG.getContext(); 16536 SDLoc DL(St); 16537 // Details about the old store 16538 SDValue Ch = St->getChain(); 16539 SDValue BasePtr = St->getBasePtr(); 16540 Align Alignment = St->getOriginalAlign(); 16541 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 16542 AAMDNodes AAInfo = St->getAAInfo(); 16543 16544 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(), 16545 FromVT.getVectorNumElements()); 16546 16547 SmallVector<SDValue, 4> Stores; 16548 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) { 16549 unsigned NewOffset = 16550 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8; 16551 SDValue NewPtr = 16552 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 16553 16554 SDValue Extract = Trunc.getOperand(i); 16555 SDValue Store = DAG.getTruncStore( 16556 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), 16557 NewToVT, Alignment.value(), MMOFlags, AAInfo); 16558 Stores.push_back(Store); 16559 } 16560 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 16561 } 16562 16563 // Given a floating point store from an extracted vector, with an integer 16564 // VGETLANE that already exists, store the existing VGETLANEu directly. This can 16565 // help reduce fp register pressure, doesn't require the fp extract and allows 16566 // use of more integer post-inc stores not available with vstr. 16567 static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) { 16568 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 16569 return SDValue(); 16570 SDValue Extract = St->getValue(); 16571 EVT VT = Extract.getValueType(); 16572 // For now only uses f16. This may be useful for f32 too, but that will 16573 // be bitcast(extract), not the VGETLANEu we currently check here. 16574 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 16575 return SDValue(); 16576 16577 SDNode *GetLane = 16578 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32), 16579 {Extract.getOperand(0), Extract.getOperand(1)}); 16580 if (!GetLane) 16581 return SDValue(); 16582 16583 LLVMContext &C = *DAG.getContext(); 16584 SDLoc DL(St); 16585 // Create a new integer store to replace the existing floating point version. 16586 SDValue Ch = St->getChain(); 16587 SDValue BasePtr = St->getBasePtr(); 16588 Align Alignment = St->getOriginalAlign(); 16589 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 16590 AAMDNodes AAInfo = St->getAAInfo(); 16591 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits()); 16592 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr, 16593 St->getPointerInfo(), NewToVT, 16594 Alignment.value(), MMOFlags, AAInfo); 16595 16596 return Store; 16597 } 16598 16599 /// PerformSTORECombine - Target-specific dag combine xforms for 16600 /// ISD::STORE. 
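/// Among other things (illustrative summary): a store of a VMOVDRR is split
/// into two i32 stores at offsets 0 and 4 so NEON and integer stores of
/// arguments do not share a cache line, and an i64 store of an
/// extract_vector_elt is rewritten as an f64 extract plus bitcast so that the
/// i64 value is not legalized into a pair of i32 values.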
16601 static SDValue PerformSTORECombine(SDNode *N, 16602 TargetLowering::DAGCombinerInfo &DCI, 16603 const ARMSubtarget *Subtarget) { 16604 StoreSDNode *St = cast<StoreSDNode>(N); 16605 if (St->isVolatile()) 16606 return SDValue(); 16607 SDValue StVal = St->getValue(); 16608 EVT VT = StVal.getValueType(); 16609 16610 if (Subtarget->hasNEON()) 16611 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) 16612 return Store; 16613 16614 if (Subtarget->hasMVEIntegerOps()) { 16615 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) 16616 return NewToken; 16617 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG)) 16618 return NewChain; 16619 if (SDValue NewToken = 16620 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG)) 16621 return NewToken; 16622 } 16623 16624 if (!ISD::isNormalStore(St)) 16625 return SDValue(); 16626 16627 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 16628 // ARM stores of arguments in the same cache line. 16629 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 16630 StVal.getNode()->hasOneUse()) { 16631 SelectionDAG &DAG = DCI.DAG; 16632 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 16633 SDLoc DL(St); 16634 SDValue BasePtr = St->getBasePtr(); 16635 SDValue NewST1 = DAG.getStore( 16636 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), 16637 BasePtr, St->getPointerInfo(), St->getOriginalAlign(), 16638 St->getMemOperand()->getFlags()); 16639 16640 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 16641 DAG.getConstant(4, DL, MVT::i32)); 16642 return DAG.getStore(NewST1.getValue(0), DL, 16643 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 16644 OffsetPtr, St->getPointerInfo().getWithOffset(4), 16645 St->getOriginalAlign(), 16646 St->getMemOperand()->getFlags()); 16647 } 16648 16649 if (StVal.getValueType() == MVT::i64 && 16650 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 16651 16652 // Bitcast an i64 store extracted from a vector to f64. 16653 // Otherwise, the i64 value will be legalized to a pair of i32 values. 16654 SelectionDAG &DAG = DCI.DAG; 16655 SDLoc dl(StVal); 16656 SDValue IntVec = StVal.getOperand(0); 16657 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 16658 IntVec.getValueType().getVectorNumElements()); 16659 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 16660 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 16661 Vec, StVal.getOperand(1)); 16662 dl = SDLoc(N); 16663 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 16664 // Make the DAGCombiner fold the bitcasts. 16665 DCI.AddToWorklist(Vec.getNode()); 16666 DCI.AddToWorklist(ExtElt.getNode()); 16667 DCI.AddToWorklist(V.getNode()); 16668 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), 16669 St->getPointerInfo(), St->getAlign(), 16670 St->getMemOperand()->getFlags(), St->getAAInfo()); 16671 } 16672 16673 // If this is a legal vector store, try to combine it into a VST1_UPD. 16674 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() && 16675 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 16676 return CombineBaseUpdate(N, DCI); 16677 16678 return SDValue(); 16679 } 16680 16681 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) 16682 /// can replace combinations of VMUL and VCVT (floating-point to integer) 16683 /// when the VMUL has a constant operand that is a power of 2. 
16684 /// 16685 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 16686 /// vmul.f32 d16, d17, d16 16687 /// vcvt.s32.f32 d16, d16 16688 /// becomes: 16689 /// vcvt.s32.f32 d16, d16, #3 16690 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, 16691 const ARMSubtarget *Subtarget) { 16692 if (!Subtarget->hasNEON()) 16693 return SDValue(); 16694 16695 SDValue Op = N->getOperand(0); 16696 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 16697 Op.getOpcode() != ISD::FMUL) 16698 return SDValue(); 16699 16700 SDValue ConstVec = Op->getOperand(1); 16701 if (!isa<BuildVectorSDNode>(ConstVec)) 16702 return SDValue(); 16703 16704 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 16705 uint32_t FloatBits = FloatTy.getSizeInBits(); 16706 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 16707 uint32_t IntBits = IntTy.getSizeInBits(); 16708 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 16709 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 16710 // These instructions only exist converting from f32 to i32. We can handle 16711 // smaller integers by generating an extra truncate, but larger ones would 16712 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 16713 // these intructions only support v2i32/v4i32 types. 16714 return SDValue(); 16715 } 16716 16717 BitVector UndefElements; 16718 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 16719 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 16720 if (C == -1 || C == 0 || C > 32) 16721 return SDValue(); 16722 16723 SDLoc dl(N); 16724 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; 16725 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : 16726 Intrinsic::arm_neon_vcvtfp2fxu; 16727 SDValue FixConv = DAG.getNode( 16728 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, 16729 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), 16730 DAG.getConstant(C, dl, MVT::i32)); 16731 16732 if (IntBits < FloatBits) 16733 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); 16734 16735 return FixConv; 16736 } 16737 16738 static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, 16739 const ARMSubtarget *Subtarget) { 16740 if (!Subtarget->hasMVEFloatOps()) 16741 return SDValue(); 16742 16743 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x) 16744 // The second form can be more easily turned into a predicated vadd, and 16745 // possibly combined into a fma to become a predicated vfma. 16746 SDValue Op0 = N->getOperand(0); 16747 SDValue Op1 = N->getOperand(1); 16748 EVT VT = N->getValueType(0); 16749 SDLoc DL(N); 16750 16751 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set, 16752 // which these VMOV's represent. 
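  // (A note on the magic numbers below: under the VMOV modified-immediate
  // encoding, 1664 appears to encode a splat of 0x80000000 per f32 lane and
  // 2688 a splat of 0x8000 per f16 lane, i.e. -0.0; an immediate of 0 encodes
  // +0.0, which is only an identity when nsz is set.)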
16753 auto isIdentitySplat = [&](SDValue Op, bool NSZ) { 16754 if (Op.getOpcode() != ISD::BITCAST || 16755 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM) 16756 return false; 16757 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0); 16758 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ))) 16759 return true; 16760 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ))) 16761 return true; 16762 return false; 16763 }; 16764 16765 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT) 16766 std::swap(Op0, Op1); 16767 16768 if (Op1.getOpcode() != ISD::VSELECT) 16769 return SDValue(); 16770 16771 SDNodeFlags FaddFlags = N->getFlags(); 16772 bool NSZ = FaddFlags.hasNoSignedZeros(); 16773 if (!isIdentitySplat(Op1.getOperand(2), NSZ)) 16774 return SDValue(); 16775 16776 SDValue FAdd = 16777 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags); 16778 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags); 16779 } 16780 16781 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) 16782 /// can replace combinations of VCVT (integer to floating-point) and VDIV 16783 /// when the VDIV has a constant operand that is a power of 2. 16784 /// 16785 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 16786 /// vcvt.f32.s32 d16, d16 16787 /// vdiv.f32 d16, d17, d16 16788 /// becomes: 16789 /// vcvt.f32.s32 d16, d16, #3 16790 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, 16791 const ARMSubtarget *Subtarget) { 16792 if (!Subtarget->hasNEON()) 16793 return SDValue(); 16794 16795 SDValue Op = N->getOperand(0); 16796 unsigned OpOpcode = Op.getNode()->getOpcode(); 16797 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() || 16798 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) 16799 return SDValue(); 16800 16801 SDValue ConstVec = N->getOperand(1); 16802 if (!isa<BuildVectorSDNode>(ConstVec)) 16803 return SDValue(); 16804 16805 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 16806 uint32_t FloatBits = FloatTy.getSizeInBits(); 16807 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 16808 uint32_t IntBits = IntTy.getSizeInBits(); 16809 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 16810 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 16811 // These instructions only exist converting from i32 to f32. We can handle 16812 // smaller integers by generating an extra extend, but larger ones would 16813 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 16814 // these intructions only support v2i32/v4i32 types. 16815 return SDValue(); 16816 } 16817 16818 BitVector UndefElements; 16819 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 16820 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 16821 if (C == -1 || C == 0 || C > 32) 16822 return SDValue(); 16823 16824 SDLoc dl(N); 16825 bool isSigned = OpOpcode == ISD::SINT_TO_FP; 16826 SDValue ConvInput = Op.getOperand(0); 16827 if (IntBits < FloatBits) 16828 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, 16829 dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, 16830 ConvInput); 16831 16832 unsigned IntrinsicOpcode = isSigned ? 
Intrinsic::arm_neon_vcvtfxs2fp : 16833 Intrinsic::arm_neon_vcvtfxu2fp; 16834 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, 16835 Op.getValueType(), 16836 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), 16837 ConvInput, DAG.getConstant(C, dl, MVT::i32)); 16838 } 16839 16840 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, 16841 const ARMSubtarget *ST) { 16842 if (!ST->hasMVEIntegerOps()) 16843 return SDValue(); 16844 16845 assert(N->getOpcode() == ISD::VECREDUCE_ADD); 16846 EVT ResVT = N->getValueType(0); 16847 SDValue N0 = N->getOperand(0); 16848 SDLoc dl(N); 16849 16850 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y) 16851 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD && 16852 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 || 16853 N0.getValueType() == MVT::v16i8)) { 16854 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0)); 16855 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1)); 16856 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1); 16857 } 16858 16859 // We are looking for something that will have illegal types if left alone, 16860 // but that we can convert to a single instruction under MVE. For example 16861 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A 16862 // or 16863 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B 16864 16865 // The legal cases are: 16866 // VADDV u/s 8/16/32 16867 // VMLAV u/s 8/16/32 16868 // VADDLV u/s 32 16869 // VMLALV u/s 16/32 16870 16871 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can 16872 // extend it and use v4i32 instead. 16873 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) { 16874 EVT AVT = A.getValueType(); 16875 return any_of(ExtTypes, [&](MVT Ty) { 16876 return AVT.getVectorNumElements() == Ty.getVectorNumElements() && 16877 AVT.bitsLE(Ty); 16878 }); 16879 }; 16880 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) { 16881 EVT AVT = A.getValueType(); 16882 if (!AVT.is128BitVector()) 16883 A = DAG.getNode(ExtendCode, dl, 16884 AVT.changeVectorElementType(MVT::getIntegerVT( 16885 128 / AVT.getVectorMinNumElements())), 16886 A); 16887 return A; 16888 }; 16889 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) { 16890 if (ResVT != RetTy || N0->getOpcode() != ExtendCode) 16891 return SDValue(); 16892 SDValue A = N0->getOperand(0); 16893 if (ExtTypeMatches(A, ExtTypes)) 16894 return ExtendIfNeeded(A, ExtendCode); 16895 return SDValue(); 16896 }; 16897 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode, 16898 ArrayRef<MVT> ExtTypes, SDValue &Mask) { 16899 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || 16900 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) 16901 return SDValue(); 16902 Mask = N0->getOperand(0); 16903 SDValue Ext = N0->getOperand(1); 16904 if (Ext->getOpcode() != ExtendCode) 16905 return SDValue(); 16906 SDValue A = Ext->getOperand(0); 16907 if (ExtTypeMatches(A, ExtTypes)) 16908 return ExtendIfNeeded(A, ExtendCode); 16909 return SDValue(); 16910 }; 16911 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, 16912 SDValue &A, SDValue &B) { 16913 // For a vmla we are trying to match a larger pattern: 16914 // ExtA = sext/zext A 16915 // ExtB = sext/zext B 16916 // Mul = mul ExtA, ExtB 16917 // vecreduce.add Mul 16918 // There might also be en extra extend between the mul and the addreduce, so 16919 // long as the bitwidth is high enough to make them equivalent (for 
example 16920 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64). 16921 if (ResVT != RetTy) 16922 return false; 16923 SDValue Mul = N0; 16924 if (Mul->getOpcode() == ExtendCode && 16925 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= 16926 ResVT.getScalarSizeInBits()) 16927 Mul = Mul->getOperand(0); 16928 if (Mul->getOpcode() != ISD::MUL) 16929 return false; 16930 SDValue ExtA = Mul->getOperand(0); 16931 SDValue ExtB = Mul->getOperand(1); 16932 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode) 16933 return false; 16934 A = ExtA->getOperand(0); 16935 B = ExtB->getOperand(0); 16936 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) { 16937 A = ExtendIfNeeded(A, ExtendCode); 16938 B = ExtendIfNeeded(B, ExtendCode); 16939 return true; 16940 } 16941 return false; 16942 }; 16943 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, 16944 SDValue &A, SDValue &B, SDValue &Mask) { 16945 // Same as the pattern above with a select for the zero predicated lanes 16946 // ExtA = sext/zext A 16947 // ExtB = sext/zext B 16948 // Mul = mul ExtA, ExtB 16949 // N0 = select Mask, Mul, 0 16950 // vecreduce.add N0 16951 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || 16952 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) 16953 return false; 16954 Mask = N0->getOperand(0); 16955 SDValue Mul = N0->getOperand(1); 16956 if (Mul->getOpcode() == ExtendCode && 16957 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= 16958 ResVT.getScalarSizeInBits()) 16959 Mul = Mul->getOperand(0); 16960 if (Mul->getOpcode() != ISD::MUL) 16961 return false; 16962 SDValue ExtA = Mul->getOperand(0); 16963 SDValue ExtB = Mul->getOperand(1); 16964 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode) 16965 return false; 16966 A = ExtA->getOperand(0); 16967 B = ExtB->getOperand(0); 16968 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) { 16969 A = ExtendIfNeeded(A, ExtendCode); 16970 B = ExtendIfNeeded(B, ExtendCode); 16971 return true; 16972 } 16973 return false; 16974 }; 16975 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) { 16976 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64 16977 // reductions. The operands are extended with MVEEXT, but as they are 16978 // reductions the lane orders do not matter. MVEEXT may be combined with 16979 // loads to produce two extending loads, or else they will be expanded to 16980 // VREV/VMOVL. 16981 EVT VT = Ops[0].getValueType(); 16982 if (VT == MVT::v16i8) { 16983 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) && 16984 "Unexpected illegal long reduction opcode"); 16985 bool IsUnsigned = Opcode == ARMISD::VMLALVu; 16986 16987 SDValue Ext0 = 16988 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl, 16989 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]); 16990 SDValue Ext1 = 16991 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl, 16992 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]); 16993 16994 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 16995 Ext0, Ext1); 16996 SDValue MLA1 = 16997 DAG.getNode(IsUnsigned ? 
ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl, 16998 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1), 16999 Ext0.getValue(1), Ext1.getValue(1)); 17000 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1)); 17001 } 17002 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops); 17003 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node, 17004 SDValue(Node.getNode(), 1)); 17005 }; 17006 17007 SDValue A, B; 17008 SDValue Mask; 17009 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 17010 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B); 17011 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 17012 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B); 17013 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32}, 17014 A, B)) 17015 return Create64bitNode(ARMISD::VMLALVs, {A, B}); 17016 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32}, 17017 A, B)) 17018 return Create64bitNode(ARMISD::VMLALVu, {A, B}); 17019 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B)) 17020 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 17021 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B)); 17022 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B)) 17023 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 17024 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B)); 17025 17026 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, 17027 Mask)) 17028 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask); 17029 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, 17030 Mask)) 17031 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask); 17032 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B, 17033 Mask)) 17034 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask}); 17035 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B, 17036 Mask)) 17037 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask}); 17038 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask)) 17039 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 17040 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask)); 17041 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask)) 17042 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 17043 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask)); 17044 17045 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8})) 17046 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A); 17047 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8})) 17048 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A); 17049 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32})) 17050 return Create64bitNode(ARMISD::VADDLVs, {A}); 17051 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32})) 17052 return Create64bitNode(ARMISD::VADDLVu, {A}); 17053 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8})) 17054 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 17055 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A)); 17056 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8})) 17057 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 17058 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A)); 17059 17060 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) 17061 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask); 17062 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, 
MVT::v16i8}, Mask)) 17063 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask); 17064 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask)) 17065 return Create64bitNode(ARMISD::VADDLVps, {A, Mask}); 17066 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask)) 17067 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask}); 17068 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask)) 17069 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 17070 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask)); 17071 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask)) 17072 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 17073 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask)); 17074 17075 // Some complications. We can get a case where the two inputs of the mul are 17076 // the same, then the output sext will have been helpfully converted to a 17077 // zext. Turn it back. 17078 SDValue Op = N0; 17079 if (Op->getOpcode() == ISD::VSELECT) 17080 Op = Op->getOperand(1); 17081 if (Op->getOpcode() == ISD::ZERO_EXTEND && 17082 Op->getOperand(0)->getOpcode() == ISD::MUL) { 17083 SDValue Mul = Op->getOperand(0); 17084 if (Mul->getOperand(0) == Mul->getOperand(1) && 17085 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) { 17086 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul); 17087 if (Op != N0) 17088 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0), 17089 N0->getOperand(0), Ext, N0->getOperand(2)); 17090 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext); 17091 } 17092 } 17093 17094 return SDValue(); 17095 } 17096 17097 static SDValue PerformVMOVNCombine(SDNode *N, 17098 TargetLowering::DAGCombinerInfo &DCI) { 17099 SDValue Op0 = N->getOperand(0); 17100 SDValue Op1 = N->getOperand(1); 17101 unsigned IsTop = N->getConstantOperandVal(2); 17102 17103 // VMOVNT a undef -> a 17104 // VMOVNB a undef -> a 17105 // VMOVNB undef a -> a 17106 if (Op1->isUndef()) 17107 return Op0; 17108 if (Op0->isUndef() && !IsTop) 17109 return Op1; 17110 17111 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b) 17112 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b) 17113 if ((Op1->getOpcode() == ARMISD::VQMOVNs || 17114 Op1->getOpcode() == ARMISD::VQMOVNu) && 17115 Op1->getConstantOperandVal(2) == 0) 17116 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0), 17117 Op0, Op1->getOperand(1), N->getOperand(2)); 17118 17119 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from 17120 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting 17121 // into the top or bottom lanes. 17122 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 17123 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1)); 17124 APInt Op0DemandedElts = 17125 IsTop ? 
Op1DemandedElts 17126 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1)); 17127 17128 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 17129 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI)) 17130 return SDValue(N, 0); 17131 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI)) 17132 return SDValue(N, 0); 17133 17134 return SDValue(); 17135 } 17136 17137 static SDValue PerformVQMOVNCombine(SDNode *N, 17138 TargetLowering::DAGCombinerInfo &DCI) { 17139 SDValue Op0 = N->getOperand(0); 17140 unsigned IsTop = N->getConstantOperandVal(2); 17141 17142 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 17143 APInt Op0DemandedElts = 17144 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1) 17145 : APInt::getHighBitsSet(2, 1)); 17146 17147 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 17148 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI)) 17149 return SDValue(N, 0); 17150 return SDValue(); 17151 } 17152 17153 static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) { 17154 SDLoc DL(N); 17155 SDValue Op0 = N->getOperand(0); 17156 SDValue Op1 = N->getOperand(1); 17157 17158 // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from 17159 // uses of the intrinsics. 17160 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 17161 int ShiftAmt = C->getSExtValue(); 17162 if (ShiftAmt == 0) { 17163 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL); 17164 DAG.ReplaceAllUsesWith(N, Merge.getNode()); 17165 return SDValue(); 17166 } 17167 17168 if (ShiftAmt >= -32 && ShiftAmt < 0) { 17169 unsigned NewOpcode = 17170 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL; 17171 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1, 17172 DAG.getConstant(-ShiftAmt, DL, MVT::i32)); 17173 DAG.ReplaceAllUsesWith(N, NewShift.getNode()); 17174 return NewShift; 17175 } 17176 } 17177 17178 return SDValue(); 17179 } 17180 17181 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 17182 SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, 17183 DAGCombinerInfo &DCI) const { 17184 SelectionDAG &DAG = DCI.DAG; 17185 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 17186 switch (IntNo) { 17187 default: 17188 // Don't do anything for most intrinsics. 17189 break; 17190 17191 // Vector shifts: check for immediate versions and lower them. 17192 // Note: This is done during DAG combining instead of DAG legalizing because 17193 // the build_vectors for 64-bit vector element shift counts are generally 17194 // not legal, and it is hard to see their values after they get legalized to 17195 // loads from a constant pool. 
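  // For example (illustrative), a call such as
  //   @llvm.arm.neon.vshifts(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
  // has an all-constant, in-range shift amount, so it is rewritten here to
  // ARMISD::VSHLIMM %a, 3 and selects to an immediate-form vshl rather than a
  // register-shift vshl fed from a constant-pool load.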
17196 case Intrinsic::arm_neon_vshifts: 17197 case Intrinsic::arm_neon_vshiftu: 17198 case Intrinsic::arm_neon_vrshifts: 17199 case Intrinsic::arm_neon_vrshiftu: 17200 case Intrinsic::arm_neon_vrshiftn: 17201 case Intrinsic::arm_neon_vqshifts: 17202 case Intrinsic::arm_neon_vqshiftu: 17203 case Intrinsic::arm_neon_vqshiftsu: 17204 case Intrinsic::arm_neon_vqshiftns: 17205 case Intrinsic::arm_neon_vqshiftnu: 17206 case Intrinsic::arm_neon_vqshiftnsu: 17207 case Intrinsic::arm_neon_vqrshiftns: 17208 case Intrinsic::arm_neon_vqrshiftnu: 17209 case Intrinsic::arm_neon_vqrshiftnsu: { 17210 EVT VT = N->getOperand(1).getValueType(); 17211 int64_t Cnt; 17212 unsigned VShiftOpc = 0; 17213 17214 switch (IntNo) { 17215 case Intrinsic::arm_neon_vshifts: 17216 case Intrinsic::arm_neon_vshiftu: 17217 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 17218 VShiftOpc = ARMISD::VSHLIMM; 17219 break; 17220 } 17221 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 17222 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM 17223 : ARMISD::VSHRuIMM); 17224 break; 17225 } 17226 return SDValue(); 17227 17228 case Intrinsic::arm_neon_vrshifts: 17229 case Intrinsic::arm_neon_vrshiftu: 17230 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 17231 break; 17232 return SDValue(); 17233 17234 case Intrinsic::arm_neon_vqshifts: 17235 case Intrinsic::arm_neon_vqshiftu: 17236 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 17237 break; 17238 return SDValue(); 17239 17240 case Intrinsic::arm_neon_vqshiftsu: 17241 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 17242 break; 17243 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 17244 17245 case Intrinsic::arm_neon_vrshiftn: 17246 case Intrinsic::arm_neon_vqshiftns: 17247 case Intrinsic::arm_neon_vqshiftnu: 17248 case Intrinsic::arm_neon_vqshiftnsu: 17249 case Intrinsic::arm_neon_vqrshiftns: 17250 case Intrinsic::arm_neon_vqrshiftnu: 17251 case Intrinsic::arm_neon_vqrshiftnsu: 17252 // Narrowing shifts require an immediate right shift. 17253 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 17254 break; 17255 llvm_unreachable("invalid shift count for narrowing vector shift " 17256 "intrinsic"); 17257 17258 default: 17259 llvm_unreachable("unhandled vector shift"); 17260 } 17261 17262 switch (IntNo) { 17263 case Intrinsic::arm_neon_vshifts: 17264 case Intrinsic::arm_neon_vshiftu: 17265 // Opcode already set above. 
17266 break; 17267 case Intrinsic::arm_neon_vrshifts: 17268 VShiftOpc = ARMISD::VRSHRsIMM; 17269 break; 17270 case Intrinsic::arm_neon_vrshiftu: 17271 VShiftOpc = ARMISD::VRSHRuIMM; 17272 break; 17273 case Intrinsic::arm_neon_vrshiftn: 17274 VShiftOpc = ARMISD::VRSHRNIMM; 17275 break; 17276 case Intrinsic::arm_neon_vqshifts: 17277 VShiftOpc = ARMISD::VQSHLsIMM; 17278 break; 17279 case Intrinsic::arm_neon_vqshiftu: 17280 VShiftOpc = ARMISD::VQSHLuIMM; 17281 break; 17282 case Intrinsic::arm_neon_vqshiftsu: 17283 VShiftOpc = ARMISD::VQSHLsuIMM; 17284 break; 17285 case Intrinsic::arm_neon_vqshiftns: 17286 VShiftOpc = ARMISD::VQSHRNsIMM; 17287 break; 17288 case Intrinsic::arm_neon_vqshiftnu: 17289 VShiftOpc = ARMISD::VQSHRNuIMM; 17290 break; 17291 case Intrinsic::arm_neon_vqshiftnsu: 17292 VShiftOpc = ARMISD::VQSHRNsuIMM; 17293 break; 17294 case Intrinsic::arm_neon_vqrshiftns: 17295 VShiftOpc = ARMISD::VQRSHRNsIMM; 17296 break; 17297 case Intrinsic::arm_neon_vqrshiftnu: 17298 VShiftOpc = ARMISD::VQRSHRNuIMM; 17299 break; 17300 case Intrinsic::arm_neon_vqrshiftnsu: 17301 VShiftOpc = ARMISD::VQRSHRNsuIMM; 17302 break; 17303 } 17304 17305 SDLoc dl(N); 17306 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 17307 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 17308 } 17309 17310 case Intrinsic::arm_neon_vshiftins: { 17311 EVT VT = N->getOperand(1).getValueType(); 17312 int64_t Cnt; 17313 unsigned VShiftOpc = 0; 17314 17315 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 17316 VShiftOpc = ARMISD::VSLIIMM; 17317 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 17318 VShiftOpc = ARMISD::VSRIIMM; 17319 else { 17320 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 17321 } 17322 17323 SDLoc dl(N); 17324 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 17325 N->getOperand(1), N->getOperand(2), 17326 DAG.getConstant(Cnt, dl, MVT::i32)); 17327 } 17328 17329 case Intrinsic::arm_neon_vqrshifts: 17330 case Intrinsic::arm_neon_vqrshiftu: 17331 // No immediate versions of these to check for. 17332 break; 17333 17334 case Intrinsic::arm_mve_vqdmlah: 17335 case Intrinsic::arm_mve_vqdmlash: 17336 case Intrinsic::arm_mve_vqrdmlah: 17337 case Intrinsic::arm_mve_vqrdmlash: 17338 case Intrinsic::arm_mve_vmla_n_predicated: 17339 case Intrinsic::arm_mve_vmlas_n_predicated: 17340 case Intrinsic::arm_mve_vqdmlah_predicated: 17341 case Intrinsic::arm_mve_vqdmlash_predicated: 17342 case Intrinsic::arm_mve_vqrdmlah_predicated: 17343 case Intrinsic::arm_mve_vqrdmlash_predicated: { 17344 // These intrinsics all take an i32 scalar operand which is narrowed to the 17345 // size of a single lane of the vector type they return. So we don't need 17346 // any bits of that operand above that point, which allows us to eliminate 17347 // uxth/sxth. 17348 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); 17349 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); 17350 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI)) 17351 return SDValue(); 17352 break; 17353 } 17354 17355 case Intrinsic::arm_mve_minv: 17356 case Intrinsic::arm_mve_maxv: 17357 case Intrinsic::arm_mve_minav: 17358 case Intrinsic::arm_mve_maxav: 17359 case Intrinsic::arm_mve_minv_predicated: 17360 case Intrinsic::arm_mve_maxv_predicated: 17361 case Intrinsic::arm_mve_minav_predicated: 17362 case Intrinsic::arm_mve_maxav_predicated: { 17363 // These intrinsics all take an i32 scalar operand which is narrowed to the 17364 // size of a single lane of the vector type they take as the other input. 
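    // For example (illustrative), with a v8i16 vector operand only the low 16
    // bits of the scalar are demanded, so SimplifyDemandedBits below can strip
    // a redundant sxth/uxth feeding the minimum/maximum value operand.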
17365 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits(); 17366 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); 17367 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 17368 return SDValue(); 17369 break; 17370 } 17371 17372 case Intrinsic::arm_mve_addv: { 17373 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node, 17374 // which allow PerformADDVecReduce to turn it into VADDLV when possible. 17375 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 17376 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs; 17377 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1)); 17378 } 17379 17380 case Intrinsic::arm_mve_addlv: 17381 case Intrinsic::arm_mve_addlv_predicated: { 17382 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR 17383 // which recombines the two outputs into an i64 17384 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 17385 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ? 17386 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) : 17387 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps); 17388 17389 SmallVector<SDValue, 4> Ops; 17390 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++) 17391 if (i != 2) // skip the unsigned flag 17392 Ops.push_back(N->getOperand(i)); 17393 17394 SDLoc dl(N); 17395 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops); 17396 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0), 17397 val.getValue(1)); 17398 } 17399 } 17400 17401 return SDValue(); 17402 } 17403 17404 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 17405 /// lowers them. As with the vector shift intrinsics, this is done during DAG 17406 /// combining instead of DAG legalizing because the build_vectors for 64-bit 17407 /// vector element shift counts are generally not legal, and it is hard to see 17408 /// their values after they get legalized to loads from a constant pool. 17409 static SDValue PerformShiftCombine(SDNode *N, 17410 TargetLowering::DAGCombinerInfo &DCI, 17411 const ARMSubtarget *ST) { 17412 SelectionDAG &DAG = DCI.DAG; 17413 EVT VT = N->getValueType(0); 17414 17415 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && 17416 N->getOperand(0)->getOpcode() == ISD::AND && 17417 N->getOperand(0)->hasOneUse()) { 17418 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 17419 return SDValue(); 17420 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't 17421 // usually show up because instcombine prefers to canonicalize it to 17422 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come 17423 // out of GEP lowering in some cases. 17424 SDValue N0 = N->getOperand(0); 17425 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); 17426 if (!ShiftAmtNode) 17427 return SDValue(); 17428 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); 17429 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 17430 if (!AndMaskNode) 17431 return SDValue(); 17432 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); 17433 // Don't transform uxtb/uxth. 
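    // (Masks of 0xFF/0xFFFF are skipped below since they already select to a
    // single uxtb/uxth.)  Otherwise, as an illustration,
    // (shl (and x, 0x3FF), 2) becomes (srl (shl x, 22), 20): two shift
    // instructions instead of materializing the mask constant for Thumb1.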
17434 if (AndMask == 255 || AndMask == 65535) 17435 return SDValue(); 17436 if (isMask_32(AndMask)) { 17437 uint32_t MaskedBits = countLeadingZeros(AndMask); 17438 if (MaskedBits > ShiftAmt) { 17439 SDLoc DL(N); 17440 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 17441 DAG.getConstant(MaskedBits, DL, MVT::i32)); 17442 return DAG.getNode( 17443 ISD::SRL, DL, MVT::i32, SHL, 17444 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); 17445 } 17446 } 17447 } 17448 17449 // Nothing to be done for scalar shifts. 17450 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17451 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 17452 return SDValue(); 17453 if (ST->hasMVEIntegerOps()) 17454 return SDValue(); 17455 17456 int64_t Cnt; 17457 17458 switch (N->getOpcode()) { 17459 default: llvm_unreachable("unexpected shift opcode"); 17460 17461 case ISD::SHL: 17462 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 17463 SDLoc dl(N); 17464 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 17465 DAG.getConstant(Cnt, dl, MVT::i32)); 17466 } 17467 break; 17468 17469 case ISD::SRA: 17470 case ISD::SRL: 17471 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 17472 unsigned VShiftOpc = 17473 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 17474 SDLoc dl(N); 17475 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 17476 DAG.getConstant(Cnt, dl, MVT::i32)); 17477 } 17478 } 17479 return SDValue(); 17480 } 17481 17482 // Look for a sign/zero/fpextend extend of a larger than legal load. This can be 17483 // split into multiple extending loads, which are simpler to deal with than an 17484 // arbitrary extend. For fp extends we use an integer extending load and a VCVTL 17485 // to convert the type to an f32. 17486 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { 17487 SDValue N0 = N->getOperand(0); 17488 if (N0.getOpcode() != ISD::LOAD) 17489 return SDValue(); 17490 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode()); 17491 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || 17492 LD->getExtensionType() != ISD::NON_EXTLOAD) 17493 return SDValue(); 17494 EVT FromVT = LD->getValueType(0); 17495 EVT ToVT = N->getValueType(0); 17496 if (!ToVT.isVector()) 17497 return SDValue(); 17498 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 17499 EVT ToEltVT = ToVT.getVectorElementType(); 17500 EVT FromEltVT = FromVT.getVectorElementType(); 17501 17502 unsigned NumElements = 0; 17503 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8) 17504 NumElements = 4; 17505 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16) 17506 NumElements = 4; 17507 if (NumElements == 0 || 17508 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) || 17509 FromVT.getVectorNumElements() % NumElements != 0 || 17510 !isPowerOf2_32(NumElements)) 17511 return SDValue(); 17512 17513 LLVMContext &C = *DAG.getContext(); 17514 SDLoc DL(LD); 17515 // Details about the old load 17516 SDValue Ch = LD->getChain(); 17517 SDValue BasePtr = LD->getBasePtr(); 17518 Align Alignment = LD->getOriginalAlign(); 17519 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); 17520 AAMDNodes AAInfo = LD->getAAInfo(); 17521 17522 ISD::LoadExtType NewExtType = 17523 N->getOpcode() == ISD::SIGN_EXTEND ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; 17524 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 17525 EVT NewFromVT = EVT::getVectorVT( 17526 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); 17527 EVT NewToVT = EVT::getVectorVT( 17528 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); 17529 17530 SmallVector<SDValue, 4> Loads; 17531 SmallVector<SDValue, 4> Chains; 17532 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 17533 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; 17534 SDValue NewPtr = 17535 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 17536 17537 SDValue NewLoad = 17538 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 17539 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 17540 Alignment, MMOFlags, AAInfo); 17541 Loads.push_back(NewLoad); 17542 Chains.push_back(SDValue(NewLoad.getNode(), 1)); 17543 } 17544 17545 // Float truncs need to extended with VCVTB's into their floating point types. 17546 if (FromEltVT == MVT::f16) { 17547 SmallVector<SDValue, 4> Extends; 17548 17549 for (unsigned i = 0; i < Loads.size(); i++) { 17550 SDValue LoadBC = 17551 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]); 17552 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC, 17553 DAG.getConstant(0, DL, MVT::i32)); 17554 Extends.push_back(FPExt); 17555 } 17556 17557 Loads = Extends; 17558 } 17559 17560 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 17561 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 17562 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads); 17563 } 17564 17565 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 17566 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 17567 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 17568 const ARMSubtarget *ST) { 17569 SDValue N0 = N->getOperand(0); 17570 17571 // Check for sign- and zero-extensions of vector extract operations of 8- and 17572 // 16-bit vector elements. NEON and MVE support these directly. They are 17573 // handled during DAG combining because type legalization will promote them 17574 // to 32-bit types and it is messy to recognize the operations after that. 
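  // For example (illustrative):
  //   (sext (extract_vector_elt v8i16:%q, 3)) to i32
  // becomes ARMISD::VGETLANEs %q, 3, which selects to a single lane move with
  // built-in sign extension (e.g. "vmov.s16 r0, q0[3]" on MVE) instead of a
  // plain lane move followed by sxth.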
17575 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) && 17576 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 17577 SDValue Vec = N0.getOperand(0); 17578 SDValue Lane = N0.getOperand(1); 17579 EVT VT = N->getValueType(0); 17580 EVT EltVT = N0.getValueType(); 17581 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17582 17583 if (VT == MVT::i32 && 17584 (EltVT == MVT::i8 || EltVT == MVT::i16) && 17585 TLI.isTypeLegal(Vec.getValueType()) && 17586 isa<ConstantSDNode>(Lane)) { 17587 17588 unsigned Opc = 0; 17589 switch (N->getOpcode()) { 17590 default: llvm_unreachable("unexpected opcode"); 17591 case ISD::SIGN_EXTEND: 17592 Opc = ARMISD::VGETLANEs; 17593 break; 17594 case ISD::ZERO_EXTEND: 17595 case ISD::ANY_EXTEND: 17596 Opc = ARMISD::VGETLANEu; 17597 break; 17598 } 17599 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 17600 } 17601 } 17602 17603 if (ST->hasMVEIntegerOps()) 17604 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 17605 return NewLoad; 17606 17607 return SDValue(); 17608 } 17609 17610 static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, 17611 const ARMSubtarget *ST) { 17612 if (ST->hasMVEFloatOps()) 17613 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 17614 return NewLoad; 17615 17616 return SDValue(); 17617 } 17618 17619 // Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating 17620 // constant bounds. 17621 static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, 17622 const ARMSubtarget *Subtarget) { 17623 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) && 17624 !Subtarget->isThumb2()) 17625 return SDValue(); 17626 17627 EVT VT = Op.getValueType(); 17628 SDValue Op0 = Op.getOperand(0); 17629 17630 if (VT != MVT::i32 || 17631 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) || 17632 !isa<ConstantSDNode>(Op.getOperand(1)) || 17633 !isa<ConstantSDNode>(Op0.getOperand(1))) 17634 return SDValue(); 17635 17636 SDValue Min = Op; 17637 SDValue Max = Op0; 17638 SDValue Input = Op0.getOperand(0); 17639 if (Min.getOpcode() == ISD::SMAX) 17640 std::swap(Min, Max); 17641 17642 APInt MinC = Min.getConstantOperandAPInt(1); 17643 APInt MaxC = Max.getConstantOperandAPInt(1); 17644 17645 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX || 17646 !(MinC + 1).isPowerOf2()) 17647 return SDValue(); 17648 17649 SDLoc DL(Op); 17650 if (MinC == ~MaxC) 17651 return DAG.getNode(ARMISD::SSAT, DL, VT, Input, 17652 DAG.getConstant(MinC.countTrailingOnes(), DL, VT)); 17653 if (MaxC == 0) 17654 return DAG.getNode(ARMISD::USAT, DL, VT, Input, 17655 DAG.getConstant(MinC.countTrailingOnes(), DL, VT)); 17656 17657 return SDValue(); 17658 } 17659 17660 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating 17661 /// saturates. 
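/// For example (v4i32, illustrative):
///   (smin (smax x, -32768), 32767) -> VQMOVNs of x into the bottom i16 lanes
///   (umin x, 65535)                -> VQMOVNu of x into the bottom i16 lanes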
17662 static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, 17663 const ARMSubtarget *ST) { 17664 EVT VT = N->getValueType(0); 17665 SDValue N0 = N->getOperand(0); 17666 17667 if (VT == MVT::i32) 17668 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST); 17669 17670 if (!ST->hasMVEIntegerOps()) 17671 return SDValue(); 17672 17673 if (SDValue V = PerformVQDMULHCombine(N, DAG)) 17674 return V; 17675 17676 if (VT != MVT::v4i32 && VT != MVT::v8i16) 17677 return SDValue(); 17678 17679 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) { 17680 // Check one is a smin and the other is a smax 17681 if (Min->getOpcode() != ISD::SMIN) 17682 std::swap(Min, Max); 17683 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX) 17684 return false; 17685 17686 APInt SaturateC; 17687 if (VT == MVT::v4i32) 17688 SaturateC = APInt(32, (1 << 15) - 1, true); 17689 else //if (VT == MVT::v8i16) 17690 SaturateC = APInt(16, (1 << 7) - 1, true); 17691 17692 APInt MinC, MaxC; 17693 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || 17694 MinC != SaturateC) 17695 return false; 17696 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) || 17697 MaxC != ~SaturateC) 17698 return false; 17699 return true; 17700 }; 17701 17702 if (IsSignedSaturate(N, N0.getNode())) { 17703 SDLoc DL(N); 17704 MVT ExtVT, HalfVT; 17705 if (VT == MVT::v4i32) { 17706 HalfVT = MVT::v8i16; 17707 ExtVT = MVT::v4i16; 17708 } else { // if (VT == MVT::v8i16) 17709 HalfVT = MVT::v16i8; 17710 ExtVT = MVT::v8i8; 17711 } 17712 17713 // Create a VQMOVNB with undef top lanes, then signed extended into the top 17714 // half. That extend will hopefully be removed if only the bottom bits are 17715 // demanded (though a truncating store, for example). 17716 SDValue VQMOVN = 17717 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT), 17718 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32)); 17719 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); 17720 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast, 17721 DAG.getValueType(ExtVT)); 17722 } 17723 17724 auto IsUnsignedSaturate = [&](SDNode *Min) { 17725 // For unsigned, we just need to check for <= 0xffff 17726 if (Min->getOpcode() != ISD::UMIN) 17727 return false; 17728 17729 APInt SaturateC; 17730 if (VT == MVT::v4i32) 17731 SaturateC = APInt(32, (1 << 16) - 1, true); 17732 else //if (VT == MVT::v8i16) 17733 SaturateC = APInt(16, (1 << 8) - 1, true); 17734 17735 APInt MinC; 17736 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || 17737 MinC != SaturateC) 17738 return false; 17739 return true; 17740 }; 17741 17742 if (IsUnsignedSaturate(N)) { 17743 SDLoc DL(N); 17744 MVT HalfVT; 17745 unsigned ExtConst; 17746 if (VT == MVT::v4i32) { 17747 HalfVT = MVT::v8i16; 17748 ExtConst = 0x0000FFFF; 17749 } else { //if (VT == MVT::v8i16) 17750 HalfVT = MVT::v16i8; 17751 ExtConst = 0x00FF; 17752 } 17753 17754 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with 17755 // an AND. That extend will hopefully be removed if only the bottom bits are 17756 // demanded (though a truncating store, for example). 
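    // For example, for v4i32 this turns (umin x, splat(0xffff)) into a VQMOVNu
    // of x written to the bottom lanes, followed by an AND with 0xffff in each
    // lane.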
17757 SDValue VQMOVN = 17758 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0, 17759 DAG.getConstant(0, DL, MVT::i32)); 17760 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); 17761 return DAG.getNode(ISD::AND, DL, VT, Bitcast, 17762 DAG.getConstant(ExtConst, DL, VT)); 17763 } 17764 17765 return SDValue(); 17766 } 17767 17768 static const APInt *isPowerOf2Constant(SDValue V) { 17769 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 17770 if (!C) 17771 return nullptr; 17772 const APInt *CV = &C->getAPIntValue(); 17773 return CV->isPowerOf2() ? CV : nullptr; 17774 } 17775 17776 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 17777 // If we have a CMOV, OR and AND combination such as: 17778 // if (x & CN) 17779 // y |= CM; 17780 // 17781 // And: 17782 // * CN is a single bit; 17783 // * All bits covered by CM are known zero in y 17784 // 17785 // Then we can convert this into a sequence of BFI instructions. This will 17786 // always be a win if CM is a single bit, will always be no worse than the 17787 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 17788 // three bits (due to the extra IT instruction). 17789 17790 SDValue Op0 = CMOV->getOperand(0); 17791 SDValue Op1 = CMOV->getOperand(1); 17792 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 17793 auto CC = CCNode->getAPIntValue().getLimitedValue(); 17794 SDValue CmpZ = CMOV->getOperand(4); 17795 17796 // The compare must be against zero. 17797 if (!isNullConstant(CmpZ->getOperand(1))) 17798 return SDValue(); 17799 17800 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 17801 SDValue And = CmpZ->getOperand(0); 17802 if (And->getOpcode() != ISD::AND) 17803 return SDValue(); 17804 const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); 17805 if (!AndC) 17806 return SDValue(); 17807 SDValue X = And->getOperand(0); 17808 17809 if (CC == ARMCC::EQ) { 17810 // We're performing an "equal to zero" compare. Swap the operands so we 17811 // canonicalize on a "not equal to zero" compare. 17812 std::swap(Op0, Op1); 17813 } else { 17814 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 17815 } 17816 17817 if (Op1->getOpcode() != ISD::OR) 17818 return SDValue(); 17819 17820 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 17821 if (!OrC) 17822 return SDValue(); 17823 SDValue Y = Op1->getOperand(0); 17824 17825 if (Op0 != Y) 17826 return SDValue(); 17827 17828 // Now, is it profitable to continue? 17829 APInt OrCI = OrC->getAPIntValue(); 17830 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 17831 if (OrCI.countPopulation() > Heuristic) 17832 return SDValue(); 17833 17834 // Lastly, can we determine that the bits defined by OrCI 17835 // are zero in Y? 17836 KnownBits Known = DAG.computeKnownBits(Y); 17837 if ((OrCI & Known.Zero) != OrCI) 17838 return SDValue(); 17839 17840 // OK, we can do the combine. 17841 SDValue V = Y; 17842 SDLoc dl(X); 17843 EVT VT = X.getValueType(); 17844 unsigned BitInX = AndC->logBase2(); 17845 17846 if (BitInX != 0) { 17847 // We must shift X first. 17848 X = DAG.getNode(ISD::SRL, dl, VT, X, 17849 DAG.getConstant(BitInX, dl, VT)); 17850 } 17851 17852 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 17853 BitInY < NumActiveBits; ++BitInY) { 17854 if (OrCI[BitInY] == 0) 17855 continue; 17856 APInt Mask(VT.getSizeInBits(), 0); 17857 Mask.setBit(BitInY); 17858 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 17859 // Confusingly, the operand is an *inverted* mask. 
                 DAG.getConstant(~Mask, dl, VT));
  }

  return V;
}

// Given N, the value controlling the conditional branch, search for the loop
// intrinsic, returning it, along with how the value is used. We need to handle
// patterns such as the following:
// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
// (brcond (setcc (loop.decrement), 0, eq), exit)
// (brcond (setcc (loop.decrement), 0, ne), header)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
                                   bool &Negate) {
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::XOR: {
    if (!isa<ConstantSDNode>(N.getOperand(1)))
      return SDValue();
    if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
      return SDValue();
    Negate = !Negate;
    return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
  }
  case ISD::SETCC: {
    auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!Const)
      return SDValue();
    if (Const->isZero())
      Imm = 0;
    else if (Const->isOne())
      Imm = 1;
    else
      return SDValue();
    CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
    return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
    if (IntOp != Intrinsic::test_start_loop_iterations &&
        IntOp != Intrinsic::loop_decrement_reg)
      return SDValue();
    return N;
  }
  }
  return SDValue();
}

static SDValue PerformHWLoopCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {

  // The hwloop intrinsics that we're interested in are used for control-flow,
  // either for entering or exiting the loop:
  // - test.start.loop.iterations will test whether its operand is zero. If it
  //   is zero, the proceeding branch should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero. If it is
  //   zero, the proceeding branch should not branch back to the beginning of
  //   the loop.
  // So here, we need to check how the brcond is using the result of each of
  // the intrinsics to ensure that we're branching to the right place at the
  // right time.
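  // For example (roughly), a brcond using test.start.loop.iterations that
  // skips the loop when the count is zero becomes:
  //   (WLS Chain, (WLSSETUP Count), Exit)
  // and a brcond using loop.decrement.reg that branches back to the loop
  // header becomes:
  //   (LE Chain, (LOOP_DEC Count, Size):0, Header)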
17923 17924 ISD::CondCode CC; 17925 SDValue Cond; 17926 int Imm = 1; 17927 bool Negate = false; 17928 SDValue Chain = N->getOperand(0); 17929 SDValue Dest; 17930 17931 if (N->getOpcode() == ISD::BRCOND) { 17932 CC = ISD::SETEQ; 17933 Cond = N->getOperand(1); 17934 Dest = N->getOperand(2); 17935 } else { 17936 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); 17937 CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 17938 Cond = N->getOperand(2); 17939 Dest = N->getOperand(4); 17940 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { 17941 if (!Const->isOne() && !Const->isZero()) 17942 return SDValue(); 17943 Imm = Const->getZExtValue(); 17944 } else 17945 return SDValue(); 17946 } 17947 17948 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); 17949 if (!Int) 17950 return SDValue(); 17951 17952 if (Negate) 17953 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32); 17954 17955 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { 17956 return (CC == ISD::SETEQ && Imm == 0) || 17957 (CC == ISD::SETNE && Imm == 1) || 17958 (CC == ISD::SETLT && Imm == 1) || 17959 (CC == ISD::SETULT && Imm == 1); 17960 }; 17961 17962 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { 17963 return (CC == ISD::SETEQ && Imm == 1) || 17964 (CC == ISD::SETNE && Imm == 0) || 17965 (CC == ISD::SETGT && Imm == 0) || 17966 (CC == ISD::SETUGT && Imm == 0) || 17967 (CC == ISD::SETGE && Imm == 1) || 17968 (CC == ISD::SETUGE && Imm == 1); 17969 }; 17970 17971 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && 17972 "unsupported condition"); 17973 17974 SDLoc dl(Int); 17975 SelectionDAG &DAG = DCI.DAG; 17976 SDValue Elements = Int.getOperand(2); 17977 unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue(); 17978 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) 17979 && "expected single br user"); 17980 SDNode *Br = *N->use_begin(); 17981 SDValue OtherTarget = Br->getOperand(1); 17982 17983 // Update the unconditional branch to branch to the given Dest. 17984 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { 17985 SDValue NewBrOps[] = { Br->getOperand(0), Dest }; 17986 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); 17987 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); 17988 }; 17989 17990 if (IntOp == Intrinsic::test_start_loop_iterations) { 17991 SDValue Res; 17992 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements); 17993 // We expect this 'instruction' to branch when the counter is zero. 17994 if (IsTrueIfZero(CC, Imm)) { 17995 SDValue Ops[] = {Chain, Setup, Dest}; 17996 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 17997 } else { 17998 // The logic is the reverse of what we need for WLS, so find the other 17999 // basic block target: the target of the proceeding br. 
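      // i.e. retarget the unconditional branch at Dest (the original brcond
      // destination) and have the WLS branch to the br's old destination
      // instead.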
18000 UpdateUncondBr(Br, Dest, DAG); 18001 18002 SDValue Ops[] = {Chain, Setup, OtherTarget}; 18003 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 18004 } 18005 // Update LR count to the new value 18006 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup); 18007 // Update chain 18008 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0)); 18009 return Res; 18010 } else { 18011 SDValue Size = DAG.getTargetConstant( 18012 cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); 18013 SDValue Args[] = { Int.getOperand(0), Elements, Size, }; 18014 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, 18015 DAG.getVTList(MVT::i32, MVT::Other), Args); 18016 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); 18017 18018 // We expect this instruction to branch when the count is not zero. 18019 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget; 18020 18021 // Update the unconditional branch to target the loop preheader if we've 18022 // found the condition has been reversed. 18023 if (Target == OtherTarget) 18024 UpdateUncondBr(Br, Dest, DAG); 18025 18026 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 18027 SDValue(LoopDec.getNode(), 1), Chain); 18028 18029 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; 18030 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); 18031 } 18032 return SDValue(); 18033 } 18034 18035 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 18036 SDValue 18037 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 18038 SDValue Cmp = N->getOperand(4); 18039 if (Cmp.getOpcode() != ARMISD::CMPZ) 18040 // Only looking at NE cases. 18041 return SDValue(); 18042 18043 EVT VT = N->getValueType(0); 18044 SDLoc dl(N); 18045 SDValue LHS = Cmp.getOperand(0); 18046 SDValue RHS = Cmp.getOperand(1); 18047 SDValue Chain = N->getOperand(0); 18048 SDValue BB = N->getOperand(1); 18049 SDValue ARMcc = N->getOperand(2); 18050 ARMCC::CondCodes CC = 18051 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 18052 18053 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 18054 // -> (brcond Chain BB CC CPSR Cmp) 18055 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 18056 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 18057 LHS->getOperand(0)->hasOneUse()) { 18058 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 18059 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 18060 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 18061 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 18062 if ((LHS00C && LHS00C->getZExtValue() == 0) && 18063 (LHS01C && LHS01C->getZExtValue() == 1) && 18064 (LHS1C && LHS1C->getZExtValue() == 1) && 18065 (RHSC && RHSC->getZExtValue() == 0)) { 18066 return DAG.getNode( 18067 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 18068 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 18069 } 18070 } 18071 18072 return SDValue(); 18073 } 18074 18075 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 18076 SDValue 18077 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 18078 SDValue Cmp = N->getOperand(4); 18079 if (Cmp.getOpcode() != ARMISD::CMPZ) 18080 // Only looking at EQ and NE cases. 
18081 return SDValue(); 18082 18083 EVT VT = N->getValueType(0); 18084 SDLoc dl(N); 18085 SDValue LHS = Cmp.getOperand(0); 18086 SDValue RHS = Cmp.getOperand(1); 18087 SDValue FalseVal = N->getOperand(0); 18088 SDValue TrueVal = N->getOperand(1); 18089 SDValue ARMcc = N->getOperand(2); 18090 ARMCC::CondCodes CC = 18091 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 18092 18093 // BFI is only available on V6T2+. 18094 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 18095 SDValue R = PerformCMOVToBFICombine(N, DAG); 18096 if (R) 18097 return R; 18098 } 18099 18100 // Simplify 18101 // mov r1, r0 18102 // cmp r1, x 18103 // mov r0, y 18104 // moveq r0, x 18105 // to 18106 // cmp r0, x 18107 // movne r0, y 18108 // 18109 // mov r1, r0 18110 // cmp r1, x 18111 // mov r0, x 18112 // movne r0, y 18113 // to 18114 // cmp r0, x 18115 // movne r0, y 18116 /// FIXME: Turn this into a target neutral optimization? 18117 SDValue Res; 18118 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 18119 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 18120 N->getOperand(3), Cmp); 18121 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 18122 SDValue ARMcc; 18123 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 18124 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 18125 N->getOperand(3), NewCmp); 18126 } 18127 18128 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 18129 // -> (cmov F T CC CPSR Cmp) 18130 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 18131 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 18132 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 18133 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 18134 if ((LHS0C && LHS0C->getZExtValue() == 0) && 18135 (LHS1C && LHS1C->getZExtValue() == 1) && 18136 (RHSC && RHSC->getZExtValue() == 0)) { 18137 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 18138 LHS->getOperand(2), LHS->getOperand(3), 18139 LHS->getOperand(4)); 18140 } 18141 } 18142 18143 if (!VT.isInteger()) 18144 return SDValue(); 18145 18146 // Fold away an unneccessary CMPZ/CMOV 18147 // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) -> 18148 // if C1==EQ -> CMOV A, B, C2, $cpsr, D 18149 // if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D 18150 if (N->getConstantOperandVal(2) == ARMCC::EQ || 18151 N->getConstantOperandVal(2) == ARMCC::NE) { 18152 ARMCC::CondCodes Cond; 18153 if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) { 18154 if (N->getConstantOperandVal(2) == ARMCC::NE) 18155 Cond = ARMCC::getOppositeCondition(Cond); 18156 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), 18157 N->getOperand(1), 18158 DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32), 18159 N->getOperand(3), C); 18160 } 18161 } 18162 18163 // Materialize a boolean comparison for integers so we can avoid branching. 18164 if (isNullConstant(FalseVal)) { 18165 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { 18166 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) { 18167 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it 18168 // right 5 bits will make that 32 be 1, otherwise it will be 0. 
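        // (CLZ of any non-zero 32-bit value is at most 31, and 31 >> 5 == 0,
        // while 32 >> 5 == 1.)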
18169 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5 18170 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 18171 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub), 18172 DAG.getConstant(5, dl, MVT::i32)); 18173 } else { 18174 // CMOV 0, 1, ==, (CMPZ x, y) -> 18175 // (ADDCARRY (SUB x, y), t:0, t:1) 18176 // where t = (SUBCARRY 0, (SUB x, y), 0) 18177 // 18178 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when 18179 // x != y. In other words, a carry C == 1 when x == y, C == 0 18180 // otherwise. 18181 // The final ADDCARRY computes 18182 // x - y + (0 - (x - y)) + C == C 18183 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 18184 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 18185 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub); 18186 // ISD::SUBCARRY returns a borrow but we want the carry here 18187 // actually. 18188 SDValue Carry = 18189 DAG.getNode(ISD::SUB, dl, MVT::i32, 18190 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); 18191 Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); 18192 } 18193 } else if (CC == ARMCC::NE && !isNullConstant(RHS) && 18194 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { 18195 // This seems pointless but will allow us to combine it further below. 18196 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 18197 SDValue Sub = 18198 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 18199 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 18200 Sub.getValue(1), SDValue()); 18201 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, 18202 N->getOperand(3), CPSRGlue.getValue(1)); 18203 FalseVal = Sub; 18204 } 18205 } else if (isNullConstant(TrueVal)) { 18206 if (CC == ARMCC::EQ && !isNullConstant(RHS) && 18207 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { 18208 // This seems pointless but will allow us to combine it further below 18209 // Note that we change == for != as this is the dual for the case above. 18210 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 18211 SDValue Sub = 18212 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 18213 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 18214 Sub.getValue(1), SDValue()); 18215 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, 18216 DAG.getConstant(ARMCC::NE, dl, MVT::i32), 18217 N->getOperand(3), CPSRGlue.getValue(1)); 18218 FalseVal = Sub; 18219 } 18220 } 18221 18222 // On Thumb1, the DAG above may be further combined if z is a power of 2 18223 // (z == 2 ^ K). 
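  // (For instance, z == 4 gives K == 2: the sequence below computes
  // (x != y) ? 1 : 0 in t2:0, and the final left shift by K turns that 1
  // into 4.)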
18224 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> 18225 // t1 = (USUBO (SUB x, y), 1) 18226 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) 18227 // Result = if K != 0 then (SHL t2:0, K) else t2:0 18228 // 18229 // This also handles the special case of comparing against zero; it's 18230 // essentially, the same pattern, except there's no SUBS: 18231 // CMOV x, z, !=, (CMPZ x, 0) -> 18232 // t1 = (USUBO x, 1) 18233 // t2 = (SUBCARRY x, t1:0, t1:1) 18234 // Result = if K != 0 then (SHL t2:0, K) else t2:0 18235 const APInt *TrueConst; 18236 if (Subtarget->isThumb1Only() && CC == ARMCC::NE && 18237 ((FalseVal.getOpcode() == ARMISD::SUBS && 18238 FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) || 18239 (FalseVal == LHS && isNullConstant(RHS))) && 18240 (TrueConst = isPowerOf2Constant(TrueVal))) { 18241 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 18242 unsigned ShiftAmount = TrueConst->logBase2(); 18243 if (ShiftAmount) 18244 TrueVal = DAG.getConstant(1, dl, VT); 18245 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); 18246 Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); 18247 18248 if (ShiftAmount) 18249 Res = DAG.getNode(ISD::SHL, dl, VT, Res, 18250 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 18251 } 18252 18253 if (Res.getNode()) { 18254 KnownBits Known = DAG.computeKnownBits(SDValue(N,0)); 18255 // Capture demanded bits information that would be otherwise lost. 18256 if (Known.Zero == 0xfffffffe) 18257 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 18258 DAG.getValueType(MVT::i1)); 18259 else if (Known.Zero == 0xffffff00) 18260 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 18261 DAG.getValueType(MVT::i8)); 18262 else if (Known.Zero == 0xffff0000) 18263 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 18264 DAG.getValueType(MVT::i16)); 18265 } 18266 18267 return Res; 18268 } 18269 18270 static SDValue PerformBITCASTCombine(SDNode *N, 18271 TargetLowering::DAGCombinerInfo &DCI, 18272 const ARMSubtarget *ST) { 18273 SelectionDAG &DAG = DCI.DAG; 18274 SDValue Src = N->getOperand(0); 18275 EVT DstVT = N->getValueType(0); 18276 18277 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE. 18278 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) { 18279 EVT SrcVT = Src.getValueType(); 18280 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits()) 18281 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0)); 18282 } 18283 18284 // We may have a bitcast of something that has already had this bitcast 18285 // combine performed on it, so skip past any VECTOR_REG_CASTs. 18286 while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST) 18287 Src = Src.getOperand(0); 18288 18289 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that 18290 // would be generated is at least the width of the element type. 18291 EVT SrcVT = Src.getValueType(); 18292 if ((Src.getOpcode() == ARMISD::VMOVIMM || 18293 Src.getOpcode() == ARMISD::VMVNIMM || 18294 Src.getOpcode() == ARMISD::VMOVFPIMM) && 18295 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() && 18296 DAG.getDataLayout().isBigEndian()) 18297 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src); 18298 18299 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x 18300 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) 18301 return R; 18302 18303 return SDValue(); 18304 } 18305 18306 // Some combines for the MVETrunc truncations legalizer helper. 
Also lowers the 18307 // node into stack operations after legalizeOps. 18308 SDValue ARMTargetLowering::PerformMVETruncCombine( 18309 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { 18310 SelectionDAG &DAG = DCI.DAG; 18311 EVT VT = N->getValueType(0); 18312 SDLoc DL(N); 18313 18314 // MVETrunc(Undef, Undef) -> Undef 18315 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); })) 18316 return DAG.getUNDEF(VT); 18317 18318 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc 18319 if (N->getNumOperands() == 2 && 18320 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC && 18321 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC) 18322 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0), 18323 N->getOperand(0).getOperand(1), 18324 N->getOperand(1).getOperand(0), 18325 N->getOperand(1).getOperand(1)); 18326 18327 // MVETrunc(shuffle, shuffle) -> VMOVN 18328 if (N->getNumOperands() == 2 && 18329 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && 18330 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) { 18331 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode()); 18332 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode()); 18333 18334 if (S0->getOperand(0) == S1->getOperand(0) && 18335 S0->getOperand(1) == S1->getOperand(1)) { 18336 // Construct complete shuffle mask 18337 SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end()); 18338 Mask.append(S1->getMask().begin(), S1->getMask().end()); 18339 18340 if (isVMOVNTruncMask(Mask, VT, false)) 18341 return DAG.getNode( 18342 ARMISD::VMOVN, DL, VT, 18343 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), 18344 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), 18345 DAG.getConstant(1, DL, MVT::i32)); 18346 if (isVMOVNTruncMask(Mask, VT, true)) 18347 return DAG.getNode( 18348 ARMISD::VMOVN, DL, VT, 18349 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), 18350 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), 18351 DAG.getConstant(1, DL, MVT::i32)); 18352 } 18353 } 18354 18355 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the 18356 // truncate to a buildvector to allow the generic optimisations to kick in. 
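  // For example (illustrative), an MVETRUNC of two v4i32 BUILD_VECTORs becomes
  // a single v8i16 BUILD_VECTOR of the extracted i32 elements, which the
  // generic combines can then simplify further.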
18357 if (all_of(N->ops(), [](SDValue Op) { 18358 return Op.getOpcode() == ISD::BUILD_VECTOR || 18359 Op.getOpcode() == ISD::VECTOR_SHUFFLE || 18360 (Op.getOpcode() == ISD::BITCAST && 18361 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR); 18362 })) { 18363 SmallVector<SDValue, 8> Extracts; 18364 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) { 18365 SDValue O = N->getOperand(Op); 18366 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) { 18367 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O, 18368 DAG.getConstant(i, DL, MVT::i32)); 18369 Extracts.push_back(Ext); 18370 } 18371 } 18372 return DAG.getBuildVector(VT, DL, Extracts); 18373 } 18374 18375 // If we are late in the legalization process and nothing has optimised 18376 // the trunc to anything better, lower it to a stack store and reload, 18377 // performing the truncation whilst keeping the lanes in the correct order: 18378 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack; 18379 if (!DCI.isAfterLegalizeDAG()) 18380 return SDValue(); 18381 18382 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)); 18383 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); 18384 int NumIns = N->getNumOperands(); 18385 assert((NumIns == 2 || NumIns == 4) && 18386 "Expected 2 or 4 inputs to an MVETrunc"); 18387 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 18388 if (N->getNumOperands() == 4) 18389 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext()); 18390 18391 SmallVector<SDValue> Chains; 18392 for (int I = 0; I < NumIns; I++) { 18393 SDValue Ptr = DAG.getNode( 18394 ISD::ADD, DL, StackPtr.getValueType(), StackPtr, 18395 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType())); 18396 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack( 18397 DAG.getMachineFunction(), SPFI, I * 16 / NumIns); 18398 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I), 18399 Ptr, MPI, StoreVT, Align(4)); 18400 Chains.push_back(Ch); 18401 } 18402 18403 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 18404 MachinePointerInfo MPI = 18405 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0); 18406 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4)); 18407 } 18408 18409 // Take a MVEEXT(load x) and split that into (extload x, extload x+8) 18410 static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, 18411 SelectionDAG &DAG) { 18412 SDValue N0 = N->getOperand(0); 18413 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode()); 18414 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed()) 18415 return SDValue(); 18416 18417 EVT FromVT = LD->getMemoryVT(); 18418 EVT ToVT = N->getValueType(0); 18419 if (!ToVT.isVector()) 18420 return SDValue(); 18421 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2); 18422 EVT ToEltVT = ToVT.getVectorElementType(); 18423 EVT FromEltVT = FromVT.getVectorElementType(); 18424 18425 unsigned NumElements = 0; 18426 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) 18427 NumElements = 4; 18428 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) 18429 NumElements = 8; 18430 assert(NumElements != 0); 18431 18432 ISD::LoadExtType NewExtType = 18433 N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; 18434 if (LD->getExtensionType() != ISD::NON_EXTLOAD && 18435 LD->getExtensionType() != ISD::EXTLOAD && 18436 LD->getExtensionType() != NewExtType) 18437 return SDValue(); 18438 18439 LLVMContext &C = *DAG.getContext(); 18440 SDLoc DL(LD); 18441 // Details about the old load 18442 SDValue Ch = LD->getChain(); 18443 SDValue BasePtr = LD->getBasePtr(); 18444 Align Alignment = LD->getOriginalAlign(); 18445 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); 18446 AAMDNodes AAInfo = LD->getAAInfo(); 18447 18448 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 18449 EVT NewFromVT = EVT::getVectorVT( 18450 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); 18451 EVT NewToVT = EVT::getVectorVT( 18452 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); 18453 18454 SmallVector<SDValue, 4> Loads; 18455 SmallVector<SDValue, 4> Chains; 18456 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 18457 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; 18458 SDValue NewPtr = 18459 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 18460 18461 SDValue NewLoad = 18462 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 18463 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 18464 Alignment, MMOFlags, AAInfo); 18465 Loads.push_back(NewLoad); 18466 Chains.push_back(SDValue(NewLoad.getNode(), 1)); 18467 } 18468 18469 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 18470 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 18471 return DAG.getMergeValues(Loads, DL); 18472 } 18473 18474 // Perform combines for MVEEXT. If it has not be optimized to anything better 18475 // before lowering, it gets converted to stack store and extloads performing the 18476 // extend whilst still keeping the same lane ordering. 18477 SDValue ARMTargetLowering::PerformMVEExtCombine( 18478 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { 18479 SelectionDAG &DAG = DCI.DAG; 18480 EVT VT = N->getValueType(0); 18481 SDLoc DL(N); 18482 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements"); 18483 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type"); 18484 18485 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( 18486 *DAG.getContext()); 18487 auto Extend = [&](SDValue V) { 18488 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V); 18489 return N->getOpcode() == ARMISD::MVESEXT 18490 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT, 18491 DAG.getValueType(ExtVT)) 18492 : DAG.getZeroExtendInReg(VVT, DL, ExtVT); 18493 }; 18494 18495 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP) 18496 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) { 18497 SDValue Ext = Extend(N->getOperand(0)); 18498 return DAG.getMergeValues({Ext, Ext}, DL); 18499 } 18500 18501 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG 18502 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) { 18503 ArrayRef<int> Mask = SVN->getMask(); 18504 assert(Mask.size() == 2 * VT.getVectorNumElements()); 18505 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements()); 18506 unsigned Rev = VT == MVT::v4i32 ? 
ARMISD::VREV32 : ARMISD::VREV16; 18507 SDValue Op0 = SVN->getOperand(0); 18508 SDValue Op1 = SVN->getOperand(1); 18509 18510 auto CheckInregMask = [&](int Start, int Offset) { 18511 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx) 18512 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset) 18513 return false; 18514 return true; 18515 }; 18516 SDValue V0 = SDValue(N, 0); 18517 SDValue V1 = SDValue(N, 1); 18518 if (CheckInregMask(0, 0)) 18519 V0 = Extend(Op0); 18520 else if (CheckInregMask(0, 1)) 18521 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0)); 18522 else if (CheckInregMask(0, Mask.size())) 18523 V0 = Extend(Op1); 18524 else if (CheckInregMask(0, Mask.size() + 1)) 18525 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1)); 18526 18527 if (CheckInregMask(VT.getVectorNumElements(), Mask.size())) 18528 V1 = Extend(Op1); 18529 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1)) 18530 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1)); 18531 else if (CheckInregMask(VT.getVectorNumElements(), 0)) 18532 V1 = Extend(Op0); 18533 else if (CheckInregMask(VT.getVectorNumElements(), 1)) 18534 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0)); 18535 18536 if (V0.getNode() != N || V1.getNode() != N) 18537 return DAG.getMergeValues({V0, V1}, DL); 18538 } 18539 18540 // MVEEXT(load) -> extload, extload 18541 if (N->getOperand(0)->getOpcode() == ISD::LOAD) 18542 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG)) 18543 return L; 18544 18545 if (!DCI.isAfterLegalizeDAG()) 18546 return SDValue(); 18547 18548 // Lower to a stack store and reload: 18549 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8; 18550 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)); 18551 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); 18552 int NumOuts = N->getNumValues(); 18553 assert((NumOuts == 2 || NumOuts == 4) && 18554 "Expected 2 or 4 outputs to an MVEEXT"); 18555 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( 18556 *DAG.getContext()); 18557 if (N->getNumOperands() == 4) 18558 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext()); 18559 18560 MachinePointerInfo MPI = 18561 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0); 18562 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0), 18563 StackPtr, MPI, Align(4)); 18564 18565 SmallVector<SDValue> Loads; 18566 for (int I = 0; I < NumOuts; I++) { 18567 SDValue Ptr = DAG.getNode( 18568 ISD::ADD, DL, StackPtr.getValueType(), StackPtr, 18569 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType())); 18570 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack( 18571 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts); 18572 SDValue Load = DAG.getExtLoad( 18573 N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, 18574 VT, Chain, Ptr, MPI, LoadVT, Align(4)); 18575 Loads.push_back(Load); 18576 } 18577 18578 return DAG.getMergeValues(Loads, DL); 18579 } 18580 18581 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 18582 DAGCombinerInfo &DCI) const { 18583 switch (N->getOpcode()) { 18584 default: break; 18585 case ISD::SELECT_CC: 18586 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); 18587 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); 18588 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget); 18589 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); 18590 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 18591 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 18592 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 18593 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget); 18594 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 18595 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 18596 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 18597 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 18598 case ISD::BRCOND: 18599 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); 18600 case ARMISD::ADDC: 18601 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 18602 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 18603 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG); 18604 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 18605 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 18606 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI); 18607 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG); 18608 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); 18609 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 18610 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 18611 case ISD::EXTRACT_VECTOR_ELT: 18612 return PerformExtractEltCombine(N, DCI, Subtarget); 18613 case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG); 18614 case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI); 18615 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 18616 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); 18617 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget); 18618 case ISD::FP_TO_SINT: 18619 case ISD::FP_TO_UINT: 18620 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 18621 case ISD::FADD: 18622 return PerformFAddVSelectCombine(N, DCI.DAG, Subtarget); 18623 case ISD::FDIV: 18624 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 18625 case ISD::INTRINSIC_WO_CHAIN: 18626 return PerformIntrinsicCombine(N, DCI); 18627 case ISD::SHL: 18628 case ISD::SRA: 18629 case ISD::SRL: 18630 return PerformShiftCombine(N, DCI, Subtarget); 18631 case ISD::SIGN_EXTEND: 18632 case ISD::ZERO_EXTEND: 18633 case ISD::ANY_EXTEND: 18634 return PerformExtendCombine(N, DCI.DAG, Subtarget); 18635 case ISD::FP_EXTEND: 18636 return PerformFPExtendCombine(N, DCI.DAG, Subtarget); 18637 case ISD::SMIN: 18638 case ISD::UMIN: 18639 case ISD::SMAX: 18640 case ISD::UMAX: 18641 return PerformMinMaxCombine(N, DCI.DAG, Subtarget); 18642 case ARMISD::CMOV: 18643 return PerformCMOVCombine(N, DCI.DAG); 18644 case ARMISD::BRCOND: 18645 return PerformBRCONDCombine(N, DCI.DAG); 18646 case ARMISD::CMPZ: 18647 
return PerformCMPZCombine(N, DCI.DAG); 18648 case ARMISD::CSINC: 18649 case ARMISD::CSINV: 18650 case ARMISD::CSNEG: 18651 return PerformCSETCombine(N, DCI.DAG); 18652 case ISD::LOAD: 18653 return PerformLOADCombine(N, DCI, Subtarget); 18654 case ARMISD::VLD1DUP: 18655 case ARMISD::VLD2DUP: 18656 case ARMISD::VLD3DUP: 18657 case ARMISD::VLD4DUP: 18658 return PerformVLDCombine(N, DCI); 18659 case ARMISD::BUILD_VECTOR: 18660 return PerformARMBUILD_VECTORCombine(N, DCI); 18661 case ISD::BITCAST: 18662 return PerformBITCASTCombine(N, DCI, Subtarget); 18663 case ARMISD::PREDICATE_CAST: 18664 return PerformPREDICATE_CASTCombine(N, DCI); 18665 case ARMISD::VECTOR_REG_CAST: 18666 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget); 18667 case ARMISD::MVETRUNC: 18668 return PerformMVETruncCombine(N, DCI); 18669 case ARMISD::MVESEXT: 18670 case ARMISD::MVEZEXT: 18671 return PerformMVEExtCombine(N, DCI); 18672 case ARMISD::VCMP: 18673 return PerformVCMPCombine(N, DCI.DAG, Subtarget); 18674 case ISD::VECREDUCE_ADD: 18675 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget); 18676 case ARMISD::VMOVN: 18677 return PerformVMOVNCombine(N, DCI); 18678 case ARMISD::VQMOVNs: 18679 case ARMISD::VQMOVNu: 18680 return PerformVQMOVNCombine(N, DCI); 18681 case ARMISD::ASRL: 18682 case ARMISD::LSRL: 18683 case ARMISD::LSLL: 18684 return PerformLongShiftCombine(N, DCI.DAG); 18685 case ARMISD::SMULWB: { 18686 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 18687 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 18688 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 18689 return SDValue(); 18690 break; 18691 } 18692 case ARMISD::SMULWT: { 18693 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 18694 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 18695 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 18696 return SDValue(); 18697 break; 18698 } 18699 case ARMISD::SMLALBB: 18700 case ARMISD::QADD16b: 18701 case ARMISD::QSUB16b: 18702 case ARMISD::UQADD16b: 18703 case ARMISD::UQSUB16b: { 18704 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 18705 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 18706 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 18707 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 18708 return SDValue(); 18709 break; 18710 } 18711 case ARMISD::SMLALBT: { 18712 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); 18713 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 18714 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); 18715 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 18716 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || 18717 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) 18718 return SDValue(); 18719 break; 18720 } 18721 case ARMISD::SMLALTB: { 18722 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); 18723 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 18724 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); 18725 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 18726 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || 18727 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) 18728 return SDValue(); 18729 break; 18730 } 18731 case ARMISD::SMLALTT: { 18732 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 18733 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 18734 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) 
|| 18735 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 18736 return SDValue(); 18737 break; 18738 } 18739 case ARMISD::QADD8b: 18740 case ARMISD::QSUB8b: 18741 case ARMISD::UQADD8b: 18742 case ARMISD::UQSUB8b: { 18743 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 18744 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); 18745 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 18746 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 18747 return SDValue(); 18748 break; 18749 } 18750 case ISD::INTRINSIC_VOID: 18751 case ISD::INTRINSIC_W_CHAIN: 18752 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 18753 case Intrinsic::arm_neon_vld1: 18754 case Intrinsic::arm_neon_vld1x2: 18755 case Intrinsic::arm_neon_vld1x3: 18756 case Intrinsic::arm_neon_vld1x4: 18757 case Intrinsic::arm_neon_vld2: 18758 case Intrinsic::arm_neon_vld3: 18759 case Intrinsic::arm_neon_vld4: 18760 case Intrinsic::arm_neon_vld2lane: 18761 case Intrinsic::arm_neon_vld3lane: 18762 case Intrinsic::arm_neon_vld4lane: 18763 case Intrinsic::arm_neon_vld2dup: 18764 case Intrinsic::arm_neon_vld3dup: 18765 case Intrinsic::arm_neon_vld4dup: 18766 case Intrinsic::arm_neon_vst1: 18767 case Intrinsic::arm_neon_vst1x2: 18768 case Intrinsic::arm_neon_vst1x3: 18769 case Intrinsic::arm_neon_vst1x4: 18770 case Intrinsic::arm_neon_vst2: 18771 case Intrinsic::arm_neon_vst3: 18772 case Intrinsic::arm_neon_vst4: 18773 case Intrinsic::arm_neon_vst2lane: 18774 case Intrinsic::arm_neon_vst3lane: 18775 case Intrinsic::arm_neon_vst4lane: 18776 return PerformVLDCombine(N, DCI); 18777 case Intrinsic::arm_mve_vld2q: 18778 case Intrinsic::arm_mve_vld4q: 18779 case Intrinsic::arm_mve_vst2q: 18780 case Intrinsic::arm_mve_vst4q: 18781 return PerformMVEVLDCombine(N, DCI); 18782 default: break; 18783 } 18784 break; 18785 } 18786 return SDValue(); 18787 } 18788 18789 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, 18790 EVT VT) const { 18791 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); 18792 } 18793 18794 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, 18795 Align Alignment, 18796 MachineMemOperand::Flags, 18797 bool *Fast) const { 18798 // Depends what it gets converted into if the type is weird. 18799 if (!VT.isSimple()) 18800 return false; 18801 18802 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus 18803 bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); 18804 auto Ty = VT.getSimpleVT().SimpleTy; 18805 18806 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) { 18807 // Unaligned access can use (for example) LRDB, LRDH, LDR 18808 if (AllowsUnaligned) { 18809 if (Fast) 18810 *Fast = Subtarget->hasV7Ops(); 18811 return true; 18812 } 18813 } 18814 18815 if (Ty == MVT::f64 || Ty == MVT::v2f64) { 18816 // For any little-endian targets with neon, we can support unaligned ld/st 18817 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. 18818 // A big-endian target may also explicitly support unaligned accesses 18819 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { 18820 if (Fast) 18821 *Fast = true; 18822 return true; 18823 } 18824 } 18825 18826 if (!Subtarget->hasMVEIntegerOps()) 18827 return false; 18828 18829 // These are for predicates 18830 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 || 18831 Ty == MVT::v2i1)) { 18832 if (Fast) 18833 *Fast = true; 18834 return true; 18835 } 18836 18837 // These are for truncated stores/narrowing loads. 
They are fine so long as 18838 // the alignment is at least the size of the item being loaded 18839 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) && 18840 Alignment >= VT.getScalarSizeInBits() / 8) { 18841 if (Fast) 18842 *Fast = true; 18843 return true; 18844 } 18845 18846 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and 18847 // VSTRW.U32 all store the vector register in exactly the same format, and 18848 // differ only in the range of their immediate offset field and the required 18849 // alignment. So there is always a store that can be used, regardless of 18850 // actual type. 18851 // 18852 // For big endian, that is not the case. But can still emit a (VSTRB.U8; 18853 // VREV64.8) pair and get the same effect. This will likely be better than 18854 // aligning the vector through the stack. 18855 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 || 18856 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || 18857 Ty == MVT::v2f64) { 18858 if (Fast) 18859 *Fast = true; 18860 return true; 18861 } 18862 18863 return false; 18864 } 18865 18866 18867 EVT ARMTargetLowering::getOptimalMemOpType( 18868 const MemOp &Op, const AttributeList &FuncAttributes) const { 18869 // See if we can use NEON instructions for this... 18870 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() && 18871 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) { 18872 bool Fast; 18873 if (Op.size() >= 16 && 18874 (Op.isAligned(Align(16)) || 18875 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1), 18876 MachineMemOperand::MONone, &Fast) && 18877 Fast))) { 18878 return MVT::v2f64; 18879 } else if (Op.size() >= 8 && 18880 (Op.isAligned(Align(8)) || 18881 (allowsMisalignedMemoryAccesses( 18882 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) && 18883 Fast))) { 18884 return MVT::f64; 18885 } 18886 } 18887 18888 // Let the target-independent logic figure it out. 18889 return MVT::Other; 18890 } 18891 18892 // 64-bit integers are split into their high and low parts and held in two 18893 // different registers, so the trunc is free since the low register can just 18894 // be used. 18895 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { 18896 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) 18897 return false; 18898 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); 18899 unsigned DestBits = DstTy->getPrimitiveSizeInBits(); 18900 return (SrcBits == 64 && DestBits == 32); 18901 } 18902 18903 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { 18904 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || 18905 !DstVT.isInteger()) 18906 return false; 18907 unsigned SrcBits = SrcVT.getSizeInBits(); 18908 unsigned DestBits = DstVT.getSizeInBits(); 18909 return (SrcBits == 64 && DestBits == 32); 18910 } 18911 18912 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 18913 if (Val.getOpcode() != ISD::LOAD) 18914 return false; 18915 18916 EVT VT1 = Val.getValueType(); 18917 if (!VT1.isSimple() || !VT1.isInteger() || 18918 !VT2.isSimple() || !VT2.isInteger()) 18919 return false; 18920 18921 switch (VT1.getSimpleVT().SimpleTy) { 18922 default: break; 18923 case MVT::i1: 18924 case MVT::i8: 18925 case MVT::i16: 18926 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. 
18927 return true; 18928 } 18929 18930 return false; 18931 } 18932 18933 bool ARMTargetLowering::isFNegFree(EVT VT) const { 18934 if (!VT.isSimple()) 18935 return false; 18936 18937 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that 18938 // negate values directly (fneg is free). So, we don't want to let the DAG 18939 // combiner rewrite fneg into xors and some other instructions. For f16 and 18940 // FullFP16 argument passing, some bitcast nodes may be introduced, 18941 // triggering this DAG combine rewrite, so we are avoiding that with this. 18942 switch (VT.getSimpleVT().SimpleTy) { 18943 default: break; 18944 case MVT::f16: 18945 return Subtarget->hasFullFP16(); 18946 } 18947 18948 return false; 18949 } 18950 18951 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 18952 /// of the vector elements. 18953 static bool areExtractExts(Value *Ext1, Value *Ext2) { 18954 auto areExtDoubled = [](Instruction *Ext) { 18955 return Ext->getType()->getScalarSizeInBits() == 18956 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 18957 }; 18958 18959 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 18960 !match(Ext2, m_ZExtOrSExt(m_Value())) || 18961 !areExtDoubled(cast<Instruction>(Ext1)) || 18962 !areExtDoubled(cast<Instruction>(Ext2))) 18963 return false; 18964 18965 return true; 18966 } 18967 18968 /// Check if sinking \p I's operands to I's basic block is profitable, because 18969 /// the operands can be folded into a target instruction, e.g. 18970 /// sext/zext can be folded into vsubl. 18971 bool ARMTargetLowering::shouldSinkOperands(Instruction *I, 18972 SmallVectorImpl<Use *> &Ops) const { 18973 if (!I->getType()->isVectorTy()) 18974 return false; 18975 18976 if (Subtarget->hasNEON()) { 18977 switch (I->getOpcode()) { 18978 case Instruction::Sub: 18979 case Instruction::Add: { 18980 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 18981 return false; 18982 Ops.push_back(&I->getOperandUse(0)); 18983 Ops.push_back(&I->getOperandUse(1)); 18984 return true; 18985 } 18986 default: 18987 return false; 18988 } 18989 } 18990 18991 if (!Subtarget->hasMVEIntegerOps()) 18992 return false; 18993 18994 auto IsFMSMul = [&](Instruction *I) { 18995 if (!I->hasOneUse()) 18996 return false; 18997 auto *Sub = cast<Instruction>(*I->users().begin()); 18998 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; 18999 }; 19000 auto IsFMS = [&](Instruction *I) { 19001 if (match(I->getOperand(0), m_FNeg(m_Value())) || 19002 match(I->getOperand(1), m_FNeg(m_Value()))) 19003 return true; 19004 return false; 19005 }; 19006 19007 auto IsSinker = [&](Instruction *I, int Operand) { 19008 switch (I->getOpcode()) { 19009 case Instruction::Add: 19010 case Instruction::Mul: 19011 case Instruction::FAdd: 19012 case Instruction::ICmp: 19013 case Instruction::FCmp: 19014 return true; 19015 case Instruction::FMul: 19016 return !IsFMSMul(I); 19017 case Instruction::Sub: 19018 case Instruction::FSub: 19019 case Instruction::Shl: 19020 case Instruction::LShr: 19021 case Instruction::AShr: 19022 return Operand == 1; 19023 case Instruction::Call: 19024 if (auto *II = dyn_cast<IntrinsicInst>(I)) { 19025 switch (II->getIntrinsicID()) { 19026 case Intrinsic::fma: 19027 return !IsFMS(I); 19028 case Intrinsic::sadd_sat: 19029 case Intrinsic::uadd_sat: 19030 case Intrinsic::arm_mve_add_predicated: 19031 case Intrinsic::arm_mve_mul_predicated: 19032 case Intrinsic::arm_mve_qadd_predicated: 19033 case Intrinsic::arm_mve_vhadd: 19034 case 
Intrinsic::arm_mve_hadd_predicated: 19035 case Intrinsic::arm_mve_vqdmull: 19036 case Intrinsic::arm_mve_vqdmull_predicated: 19037 case Intrinsic::arm_mve_vqdmulh: 19038 case Intrinsic::arm_mve_qdmulh_predicated: 19039 case Intrinsic::arm_mve_vqrdmulh: 19040 case Intrinsic::arm_mve_qrdmulh_predicated: 19041 case Intrinsic::arm_mve_fma_predicated: 19042 return true; 19043 case Intrinsic::ssub_sat: 19044 case Intrinsic::usub_sat: 19045 case Intrinsic::arm_mve_sub_predicated: 19046 case Intrinsic::arm_mve_qsub_predicated: 19047 case Intrinsic::arm_mve_hsub_predicated: 19048 case Intrinsic::arm_mve_vhsub: 19049 return Operand == 1; 19050 default: 19051 return false; 19052 } 19053 } 19054 return false; 19055 default: 19056 return false; 19057 } 19058 }; 19059 19060 for (auto OpIdx : enumerate(I->operands())) { 19061 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get()); 19062 // Make sure we are not already sinking this operand 19063 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) 19064 continue; 19065 19066 Instruction *Shuffle = Op; 19067 if (Shuffle->getOpcode() == Instruction::BitCast) 19068 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0)); 19069 // We are looking for a splat that can be sunk. 19070 if (!Shuffle || 19071 !match(Shuffle, m_Shuffle( 19072 m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), 19073 m_Undef(), m_ZeroMask()))) 19074 continue; 19075 if (!IsSinker(I, OpIdx.index())) 19076 continue; 19077 19078 // All uses of the shuffle should be sunk to avoid duplicating it across gpr 19079 // and vector registers 19080 for (Use &U : Op->uses()) { 19081 Instruction *Insn = cast<Instruction>(U.getUser()); 19082 if (!IsSinker(Insn, U.getOperandNo())) 19083 return false; 19084 } 19085 19086 Ops.push_back(&Shuffle->getOperandUse(0)); 19087 if (Shuffle != Op) 19088 Ops.push_back(&Op->getOperandUse(0)); 19089 Ops.push_back(&OpIdx.value()); 19090 } 19091 return true; 19092 } 19093 19094 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const { 19095 if (!Subtarget->hasMVEIntegerOps()) 19096 return nullptr; 19097 Type *SVIType = SVI->getType(); 19098 Type *ScalarType = SVIType->getScalarType(); 19099 19100 if (ScalarType->isFloatTy()) 19101 return Type::getInt32Ty(SVIType->getContext()); 19102 if (ScalarType->isHalfTy()) 19103 return Type::getInt16Ty(SVIType->getContext()); 19104 return nullptr; 19105 } 19106 19107 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 19108 EVT VT = ExtVal.getValueType(); 19109 19110 if (!isTypeLegal(VT)) 19111 return false; 19112 19113 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) { 19114 if (Ld->isExpandingLoad()) 19115 return false; 19116 } 19117 19118 if (Subtarget->hasMVEIntegerOps()) 19119 return true; 19120 19121 // Don't create a loadext if we can fold the extension into a wide/long 19122 // instruction. 19123 // If there's more than one user instruction, the loadext is desirable no 19124 // matter what. There can be two uses by the same instruction. 
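  // For example, a sext feeding a single ADD can instead become part of a NEON
  // vaddl, so the extending load is not desirable in that case.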
  if (ExtVal->use_empty() ||
      !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
    return true;

  SDNode *U = *ExtVal->use_begin();
  if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
    return false;

  return true;
}

bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
                                                        const AddrMode &AM,
                                                        Type *Ty,
                                                        unsigned AS) const {
  if (isLegalAddressingMode(DL, AM, Ty, AS)) {
    if (Subtarget->hasFPAO())
      return AM.Scale < 0 ? 1 : 0; // Positive offsets execute faster.
    return 0;
  }
  return -1;
}

/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
/// expanded to fmul + fadd.
///
/// ARM supports both fused and unfused multiply-add operations; we already
/// lower a pair of fmul and fadd to the unfused form, so it's not clear that
/// there would be a gain or that the gain would be worthwhile enough to risk
/// correctness bugs.
///
/// For MVE, we set this to true as it helps simplify the need for some
/// patterns (and we don't have the non-fused floating point instruction).
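/// For example, @llvm.fmuladd.v4f32 on an MVE target is expected to select to
/// a single vector VFMA rather than being expanded, since MVE has no unfused
/// vector multiply-accumulate to fall back on.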
19175 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 19176 EVT VT) const { 19177 if (!VT.isSimple()) 19178 return false; 19179 19180 switch (VT.getSimpleVT().SimpleTy) { 19181 case MVT::v4f32: 19182 case MVT::v8f16: 19183 return Subtarget->hasMVEFloatOps(); 19184 case MVT::f16: 19185 return Subtarget->useFPVFMx16(); 19186 case MVT::f32: 19187 return Subtarget->useFPVFMx(); 19188 case MVT::f64: 19189 return Subtarget->useFPVFMx64(); 19190 default: 19191 break; 19192 } 19193 19194 return false; 19195 } 19196 19197 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 19198 if (V < 0) 19199 return false; 19200 19201 unsigned Scale = 1; 19202 switch (VT.getSimpleVT().SimpleTy) { 19203 case MVT::i1: 19204 case MVT::i8: 19205 // Scale == 1; 19206 break; 19207 case MVT::i16: 19208 // Scale == 2; 19209 Scale = 2; 19210 break; 19211 default: 19212 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR 19213 // Scale == 4; 19214 Scale = 4; 19215 break; 19216 } 19217 19218 if ((V & (Scale - 1)) != 0) 19219 return false; 19220 return isUInt<5>(V / Scale); 19221 } 19222 19223 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 19224 const ARMSubtarget *Subtarget) { 19225 if (!VT.isInteger() && !VT.isFloatingPoint()) 19226 return false; 19227 if (VT.isVector() && Subtarget->hasNEON()) 19228 return false; 19229 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() && 19230 !Subtarget->hasMVEFloatOps()) 19231 return false; 19232 19233 bool IsNeg = false; 19234 if (V < 0) { 19235 IsNeg = true; 19236 V = -V; 19237 } 19238 19239 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U); 19240 19241 // MVE: size * imm7 19242 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) { 19243 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) { 19244 case MVT::i32: 19245 case MVT::f32: 19246 return isShiftedUInt<7,2>(V); 19247 case MVT::i16: 19248 case MVT::f16: 19249 return isShiftedUInt<7,1>(V); 19250 case MVT::i8: 19251 return isUInt<7>(V); 19252 default: 19253 return false; 19254 } 19255 } 19256 19257 // half VLDR: 2 * imm8 19258 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16()) 19259 return isShiftedUInt<8, 1>(V); 19260 // VLDR and LDRD: 4 * imm8 19261 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8) 19262 return isShiftedUInt<8, 2>(V); 19263 19264 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) { 19265 // + imm12 or - imm8 19266 if (IsNeg) 19267 return isUInt<8>(V); 19268 return isUInt<12>(V); 19269 } 19270 19271 return false; 19272 } 19273 19274 /// isLegalAddressImmediate - Return true if the integer value can be used 19275 /// as the offset of the target addressing mode for load / store of the 19276 /// given type. 19277 static bool isLegalAddressImmediate(int64_t V, EVT VT, 19278 const ARMSubtarget *Subtarget) { 19279 if (V == 0) 19280 return true; 19281 19282 if (!VT.isSimple()) 19283 return false; 19284 19285 if (Subtarget->isThumb1Only()) 19286 return isLegalT1AddressImmediate(V, VT); 19287 else if (Subtarget->isThumb2()) 19288 return isLegalT2AddressImmediate(V, VT, Subtarget); 19289 19290 // ARM mode. 19291 if (V < 0) 19292 V = - V; 19293 switch (VT.getSimpleVT().SimpleTy) { 19294 default: return false; 19295 case MVT::i1: 19296 case MVT::i8: 19297 case MVT::i32: 19298 // +- imm12 19299 return isUInt<12>(V); 19300 case MVT::i16: 19301 // +- imm8 19302 return isUInt<8>(V); 19303 case MVT::f32: 19304 case MVT::f64: 19305 if (!Subtarget->hasVFP2Base()) // FIXME: NEON? 
19306 return false; 19307 return isShiftedUInt<8, 2>(V); 19308 } 19309 } 19310 19311 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 19312 EVT VT) const { 19313 int Scale = AM.Scale; 19314 if (Scale < 0) 19315 return false; 19316 19317 switch (VT.getSimpleVT().SimpleTy) { 19318 default: return false; 19319 case MVT::i1: 19320 case MVT::i8: 19321 case MVT::i16: 19322 case MVT::i32: 19323 if (Scale == 1) 19324 return true; 19325 // r + r << imm 19326 Scale = Scale & ~1; 19327 return Scale == 2 || Scale == 4 || Scale == 8; 19328 case MVT::i64: 19329 // FIXME: What are we trying to model here? ldrd doesn't have an r + r 19330 // version in Thumb mode. 19331 // r + r 19332 if (Scale == 1) 19333 return true; 19334 // r * 2 (this can be lowered to r + r). 19335 if (!AM.HasBaseReg && Scale == 2) 19336 return true; 19337 return false; 19338 case MVT::isVoid: 19339 // Note, we allow "void" uses (basically, uses that aren't loads or 19340 // stores), because arm allows folding a scale into many arithmetic 19341 // operations. This should be made more precise and revisited later. 19342 19343 // Allow r << imm, but the imm has to be a multiple of two. 19344 if (Scale & 1) return false; 19345 return isPowerOf2_32(Scale); 19346 } 19347 } 19348 19349 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, 19350 EVT VT) const { 19351 const int Scale = AM.Scale; 19352 19353 // Negative scales are not supported in Thumb1. 19354 if (Scale < 0) 19355 return false; 19356 19357 // Thumb1 addressing modes do not support register scaling excepting the 19358 // following cases: 19359 // 1. Scale == 1 means no scaling. 19360 // 2. Scale == 2 this can be lowered to r + r if there is no base register. 19361 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); 19362 } 19363 19364 /// isLegalAddressingMode - Return true if the addressing mode represented 19365 /// by AM is legal for this target, for a load/store of the specified type. 19366 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 19367 const AddrMode &AM, Type *Ty, 19368 unsigned AS, Instruction *I) const { 19369 EVT VT = getValueType(DL, Ty, true); 19370 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 19371 return false; 19372 19373 // Can never fold addr of global into load/store. 19374 if (AM.BaseGV) 19375 return false; 19376 19377 switch (AM.Scale) { 19378 case 0: // no scale reg, must be "r+i" or "r", or "i". 19379 break; 19380 default: 19381 // ARM doesn't support any R+R*scale+imm addr modes. 19382 if (AM.BaseOffs) 19383 return false; 19384 19385 if (!VT.isSimple()) 19386 return false; 19387 19388 if (Subtarget->isThumb1Only()) 19389 return isLegalT1ScaledAddressingMode(AM, VT); 19390 19391 if (Subtarget->isThumb2()) 19392 return isLegalT2ScaledAddressingMode(AM, VT); 19393 19394 int Scale = AM.Scale; 19395 switch (VT.getSimpleVT().SimpleTy) { 19396 default: return false; 19397 case MVT::i1: 19398 case MVT::i8: 19399 case MVT::i32: 19400 if (Scale < 0) Scale = -Scale; 19401 if (Scale == 1) 19402 return true; 19403 // r + r << imm 19404 return isPowerOf2_32(Scale & ~1); 19405 case MVT::i16: 19406 case MVT::i64: 19407 // r +/- r 19408 if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) 19409 return true; 19410 // r * 2 (this can be lowered to r + r). 
19411 if (!AM.HasBaseReg && Scale == 2) 19412 return true; 19413 return false; 19414 19415 case MVT::isVoid: 19416 // Note, we allow "void" uses (basically, uses that aren't loads or 19417 // stores), because arm allows folding a scale into many arithmetic 19418 // operations. This should be made more precise and revisited later. 19419 19420 // Allow r << imm, but the imm has to be a multiple of two. 19421 if (Scale & 1) return false; 19422 return isPowerOf2_32(Scale); 19423 } 19424 } 19425 return true; 19426 } 19427 19428 /// isLegalICmpImmediate - Return true if the specified immediate is legal 19429 /// icmp immediate, that is the target has icmp instructions which can compare 19430 /// a register against the immediate without having to materialize the 19431 /// immediate into a register. 19432 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 19433 // Thumb2 and ARM modes can use cmn for negative immediates. 19434 if (!Subtarget->isThumb()) 19435 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 || 19436 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1; 19437 if (Subtarget->isThumb2()) 19438 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 || 19439 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1; 19440 // Thumb1 doesn't have cmn, and only 8-bit immediates. 19441 return Imm >= 0 && Imm <= 255; 19442 } 19443 19444 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 19445 /// *or sub* immediate, that is the target has add or sub instructions which can 19446 /// add a register with the immediate without having to materialize the 19447 /// immediate into a register. 19448 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 19449 // Same encoding for add/sub, just flip the sign. 19450 int64_t AbsImm = std::abs(Imm); 19451 if (!Subtarget->isThumb()) 19452 return ARM_AM::getSOImmVal(AbsImm) != -1; 19453 if (Subtarget->isThumb2()) 19454 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 19455 // Thumb1 only has 8-bit unsigned immediate. 19456 return AbsImm >= 0 && AbsImm <= 255; 19457 } 19458 19459 // Return false to prevent folding 19460 // (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine, 19461 // if the folding leads to worse code. 19462 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode, 19463 SDValue ConstNode) const { 19464 // Let the DAGCombiner decide for vector types and large types. 19465 const EVT VT = AddNode.getValueType(); 19466 if (VT.isVector() || VT.getScalarSizeInBits() > 32) 19467 return true; 19468 19469 // It is worse if c0 is legal add immediate, while c1*c0 is not 19470 // and has to be composed by at least two instructions. 19471 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1)); 19472 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode); 19473 const int64_t C0 = C0Node->getSExtValue(); 19474 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue(); 19475 if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue())) 19476 return true; 19477 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1) 19478 return false; 19479 19480 // Default to true and let the DAGCombiner decide. 
19481 return true; 19482 } 19483 19484 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 19485 bool isSEXTLoad, SDValue &Base, 19486 SDValue &Offset, bool &isInc, 19487 SelectionDAG &DAG) { 19488 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 19489 return false; 19490 19491 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 19492 // AddressingMode 3 19493 Base = Ptr->getOperand(0); 19494 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 19495 int RHSC = (int)RHS->getZExtValue(); 19496 if (RHSC < 0 && RHSC > -256) { 19497 assert(Ptr->getOpcode() == ISD::ADD); 19498 isInc = false; 19499 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19500 return true; 19501 } 19502 } 19503 isInc = (Ptr->getOpcode() == ISD::ADD); 19504 Offset = Ptr->getOperand(1); 19505 return true; 19506 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 19507 // AddressingMode 2 19508 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 19509 int RHSC = (int)RHS->getZExtValue(); 19510 if (RHSC < 0 && RHSC > -0x1000) { 19511 assert(Ptr->getOpcode() == ISD::ADD); 19512 isInc = false; 19513 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19514 Base = Ptr->getOperand(0); 19515 return true; 19516 } 19517 } 19518 19519 if (Ptr->getOpcode() == ISD::ADD) { 19520 isInc = true; 19521 ARM_AM::ShiftOpc ShOpcVal= 19522 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 19523 if (ShOpcVal != ARM_AM::no_shift) { 19524 Base = Ptr->getOperand(1); 19525 Offset = Ptr->getOperand(0); 19526 } else { 19527 Base = Ptr->getOperand(0); 19528 Offset = Ptr->getOperand(1); 19529 } 19530 return true; 19531 } 19532 19533 isInc = (Ptr->getOpcode() == ISD::ADD); 19534 Base = Ptr->getOperand(0); 19535 Offset = Ptr->getOperand(1); 19536 return true; 19537 } 19538 19539 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 19540 return false; 19541 } 19542 19543 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 19544 bool isSEXTLoad, SDValue &Base, 19545 SDValue &Offset, bool &isInc, 19546 SelectionDAG &DAG) { 19547 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 19548 return false; 19549 19550 Base = Ptr->getOperand(0); 19551 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 19552 int RHSC = (int)RHS->getZExtValue(); 19553 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 19554 assert(Ptr->getOpcode() == ISD::ADD); 19555 isInc = false; 19556 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19557 return true; 19558 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 19559 isInc = Ptr->getOpcode() == ISD::ADD; 19560 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19561 return true; 19562 } 19563 } 19564 19565 return false; 19566 } 19567 19568 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, 19569 bool isSEXTLoad, bool IsMasked, bool isLE, 19570 SDValue &Base, SDValue &Offset, 19571 bool &isInc, SelectionDAG &DAG) { 19572 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 19573 return false; 19574 if (!isa<ConstantSDNode>(Ptr->getOperand(1))) 19575 return false; 19576 19577 // We allow LE non-masked loads to change the type (for example use a vldrb.8 19578 // as opposed to a vldrw.32). This can allow extra addressing modes or 19579 // alignments for what is otherwise an equivalent instruction. 
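  // For example, a vldrw.u32 needs an offset that is a multiple of 4 and
  // within +/-508, but re-reading the same 128 bits with vldrb.u8 accepts any
  // byte offset within +/-127 (see the IsInRange checks below).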
19580 bool CanChangeType = isLE && !IsMasked; 19581 19582 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); 19583 int RHSC = (int)RHS->getZExtValue(); 19584 19585 auto IsInRange = [&](int RHSC, int Limit, int Scale) { 19586 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { 19587 assert(Ptr->getOpcode() == ISD::ADD); 19588 isInc = false; 19589 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19590 return true; 19591 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { 19592 isInc = Ptr->getOpcode() == ISD::ADD; 19593 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 19594 return true; 19595 } 19596 return false; 19597 }; 19598 19599 // Try to find a matching instruction based on s/zext, Alignment, Offset and 19600 // (in BE/masked) type. 19601 Base = Ptr->getOperand(0); 19602 if (VT == MVT::v4i16) { 19603 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2)) 19604 return true; 19605 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { 19606 if (IsInRange(RHSC, 0x80, 1)) 19607 return true; 19608 } else if (Alignment >= 4 && 19609 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && 19610 IsInRange(RHSC, 0x80, 4)) 19611 return true; 19612 else if (Alignment >= 2 && 19613 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && 19614 IsInRange(RHSC, 0x80, 2)) 19615 return true; 19616 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) 19617 return true; 19618 return false; 19619 } 19620 19621 /// getPreIndexedAddressParts - returns true by value, base pointer and 19622 /// offset pointer and addressing mode by reference if the node's address 19623 /// can be legally represented as pre-indexed load / store address. 19624 bool 19625 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 19626 SDValue &Offset, 19627 ISD::MemIndexedMode &AM, 19628 SelectionDAG &DAG) const { 19629 if (Subtarget->isThumb1Only()) 19630 return false; 19631 19632 EVT VT; 19633 SDValue Ptr; 19634 Align Alignment; 19635 bool isSEXTLoad = false; 19636 bool IsMasked = false; 19637 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 19638 Ptr = LD->getBasePtr(); 19639 VT = LD->getMemoryVT(); 19640 Alignment = LD->getAlign(); 19641 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 19642 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 19643 Ptr = ST->getBasePtr(); 19644 VT = ST->getMemoryVT(); 19645 Alignment = ST->getAlign(); 19646 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 19647 Ptr = LD->getBasePtr(); 19648 VT = LD->getMemoryVT(); 19649 Alignment = LD->getAlign(); 19650 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 19651 IsMasked = true; 19652 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 19653 Ptr = ST->getBasePtr(); 19654 VT = ST->getMemoryVT(); 19655 Alignment = ST->getAlign(); 19656 IsMasked = true; 19657 } else 19658 return false; 19659 19660 bool isInc; 19661 bool isLegal = false; 19662 if (VT.isVector()) 19663 isLegal = Subtarget->hasMVEIntegerOps() && 19664 getMVEIndexedAddressParts( 19665 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked, 19666 Subtarget->isLittle(), Base, Offset, isInc, DAG); 19667 else { 19668 if (Subtarget->isThumb2()) 19669 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 19670 Offset, isInc, DAG); 19671 else 19672 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 19673 Offset, isInc, DAG); 19674 } 19675 if (!isLegal) 19676 return false; 19677 19678 AM = isInc ? 
ISD::PRE_INC : ISD::PRE_DEC; 19679 return true; 19680 } 19681 19682 /// getPostIndexedAddressParts - returns true by value, base pointer and 19683 /// offset pointer and addressing mode by reference if this node can be 19684 /// combined with a load / store to form a post-indexed load / store. 19685 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 19686 SDValue &Base, 19687 SDValue &Offset, 19688 ISD::MemIndexedMode &AM, 19689 SelectionDAG &DAG) const { 19690 EVT VT; 19691 SDValue Ptr; 19692 Align Alignment; 19693 bool isSEXTLoad = false, isNonExt; 19694 bool IsMasked = false; 19695 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 19696 VT = LD->getMemoryVT(); 19697 Ptr = LD->getBasePtr(); 19698 Alignment = LD->getAlign(); 19699 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 19700 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 19701 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 19702 VT = ST->getMemoryVT(); 19703 Ptr = ST->getBasePtr(); 19704 Alignment = ST->getAlign(); 19705 isNonExt = !ST->isTruncatingStore(); 19706 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 19707 VT = LD->getMemoryVT(); 19708 Ptr = LD->getBasePtr(); 19709 Alignment = LD->getAlign(); 19710 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 19711 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 19712 IsMasked = true; 19713 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 19714 VT = ST->getMemoryVT(); 19715 Ptr = ST->getBasePtr(); 19716 Alignment = ST->getAlign(); 19717 isNonExt = !ST->isTruncatingStore(); 19718 IsMasked = true; 19719 } else 19720 return false; 19721 19722 if (Subtarget->isThumb1Only()) { 19723 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It 19724 // must be non-extending/truncating, i32, with an offset of 4. 19725 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); 19726 if (Op->getOpcode() != ISD::ADD || !isNonExt) 19727 return false; 19728 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 19729 if (!RHS || RHS->getZExtValue() != 4) 19730 return false; 19731 if (Alignment < Align(4)) 19732 return false; 19733 19734 Offset = Op->getOperand(1); 19735 Base = Op->getOperand(0); 19736 AM = ISD::POST_INC; 19737 return true; 19738 } 19739 19740 bool isInc; 19741 bool isLegal = false; 19742 if (VT.isVector()) 19743 isLegal = Subtarget->hasMVEIntegerOps() && 19744 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked, 19745 Subtarget->isLittle(), Base, Offset, 19746 isInc, DAG); 19747 else { 19748 if (Subtarget->isThumb2()) 19749 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 19750 isInc, DAG); 19751 else 19752 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 19753 isInc, DAG); 19754 } 19755 if (!isLegal) 19756 return false; 19757 19758 if (Ptr != Base) { 19759 // Swap base ptr and offset to catch more post-index load / store when 19760 // it's legal. In Thumb2 mode, offset must be an immediate. 19761 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 19762 !Subtarget->isThumb2()) 19763 std::swap(Base, Offset); 19764 19765 // Post-indexed load / store update the base pointer. 19766 if (Ptr != Base) 19767 return false; 19768 } 19769 19770 AM = isInc ? 
                 ISD::POST_INC : ISD::POST_DEC;
  return true;
}

void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::ADDC:
  case ARMISD::ADDE:
  case ARMISD::SUBC:
  case ARMISD::SUBE:
    // Special cases when we convert a carry to a boolean.
    if (Op.getResNo() == 0) {
      SDValue LHS = Op.getOperand(0);
      SDValue RHS = Op.getOperand(1);
      // (ADDE 0, 0, C) will give us a single bit.
      if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
          isNullConstant(RHS)) {
        Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
        return;
      }
    }
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
    if (Known.isUnknown())
      return;

    KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
    Known = KnownBits::commonBits(Known, KnownRHS);
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
  }
  case ARMISD::BFI: {
    // Conservatively, we can recurse down the first operand
    // and just mask out all affected bits.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

    // The operand to BFI is already a mask suitable for removing the bits it
    // sets.
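    // For example, a mask operand of 0xffff00ff means the inserted field
    // occupies bits 15:8, so anything known about those bits of operand 0 is
    // discarded below by clearing them in both Known.Zero and Known.One.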
19830 ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); 19831 const APInt &Mask = CI->getAPIntValue(); 19832 Known.Zero &= Mask; 19833 Known.One &= Mask; 19834 return; 19835 } 19836 case ARMISD::VGETLANEs: 19837 case ARMISD::VGETLANEu: { 19838 const SDValue &SrcSV = Op.getOperand(0); 19839 EVT VecVT = SrcSV.getValueType(); 19840 assert(VecVT.isVector() && "VGETLANE expected a vector type"); 19841 const unsigned NumSrcElts = VecVT.getVectorNumElements(); 19842 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode()); 19843 assert(Pos->getAPIntValue().ult(NumSrcElts) && 19844 "VGETLANE index out of bounds"); 19845 unsigned Idx = Pos->getZExtValue(); 19846 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); 19847 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1); 19848 19849 EVT VT = Op.getValueType(); 19850 const unsigned DstSz = VT.getScalarSizeInBits(); 19851 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits(); 19852 (void)SrcSz; 19853 assert(SrcSz == Known.getBitWidth()); 19854 assert(DstSz > SrcSz); 19855 if (Op.getOpcode() == ARMISD::VGETLANEs) 19856 Known = Known.sext(DstSz); 19857 else { 19858 Known = Known.zext(DstSz); 19859 } 19860 assert(DstSz == Known.getBitWidth()); 19861 break; 19862 } 19863 case ARMISD::VMOVrh: { 19864 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 19865 assert(KnownOp.getBitWidth() == 16); 19866 Known = KnownOp.zext(32); 19867 break; 19868 } 19869 case ARMISD::CSINC: 19870 case ARMISD::CSINV: 19871 case ARMISD::CSNEG: { 19872 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 19873 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 19874 19875 // The result is either: 19876 // CSINC: KnownOp0 or KnownOp1 + 1 19877 // CSINV: KnownOp0 or ~KnownOp1 19878 // CSNEG: KnownOp0 or KnownOp1 * -1 19879 if (Op.getOpcode() == ARMISD::CSINC) 19880 KnownOp1 = KnownBits::computeForAddSub( 19881 true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1))); 19882 else if (Op.getOpcode() == ARMISD::CSINV) 19883 std::swap(KnownOp1.Zero, KnownOp1.One); 19884 else if (Op.getOpcode() == ARMISD::CSNEG) 19885 KnownOp1 = KnownBits::mul( 19886 KnownOp1, KnownBits::makeConstant(APInt(32, -1))); 19887 19888 Known = KnownBits::commonBits(KnownOp0, KnownOp1); 19889 break; 19890 } 19891 } 19892 } 19893 19894 bool ARMTargetLowering::targetShrinkDemandedConstant( 19895 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, 19896 TargetLoweringOpt &TLO) const { 19897 // Delay optimization, so we don't have to deal with illegal types, or block 19898 // optimizations. 19899 if (!TLO.LegalOps) 19900 return false; 19901 19902 // Only optimize AND for now. 19903 if (Op.getOpcode() != ISD::AND) 19904 return false; 19905 19906 EVT VT = Op.getValueType(); 19907 19908 // Ignore vectors. 19909 if (VT.isVector()) 19910 return false; 19911 19912 assert(VT == MVT::i32 && "Unexpected integer type"); 19913 19914 // Make sure the RHS really is a constant. 19915 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 19916 if (!C) 19917 return false; 19918 19919 unsigned Mask = C->getZExtValue(); 19920 19921 unsigned Demanded = DemandedBits.getZExtValue(); 19922 unsigned ShrunkMask = Mask & Demanded; 19923 unsigned ExpandedMask = Mask | ~Demanded; 19924 19925 // If the mask is all zeros, let the target-independent code replace the 19926 // result with zero. 19927 if (ShrunkMask == 0) 19928 return false; 19929 19930 // If the mask is all ones, erase the AND. 
(Currently, the target-independent 19931 // code won't do this, so we have to do it explicitly to avoid an infinite 19932 // loop in obscure cases.) 19933 if (ExpandedMask == ~0U) 19934 return TLO.CombineTo(Op, Op.getOperand(0)); 19935 19936 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool { 19937 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0; 19938 }; 19939 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool { 19940 if (NewMask == Mask) 19941 return true; 19942 SDLoc DL(Op); 19943 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT); 19944 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); 19945 return TLO.CombineTo(Op, NewOp); 19946 }; 19947 19948 // Prefer uxtb mask. 19949 if (IsLegalMask(0xFF)) 19950 return UseMask(0xFF); 19951 19952 // Prefer uxth mask. 19953 if (IsLegalMask(0xFFFF)) 19954 return UseMask(0xFFFF); 19955 19956 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2. 19957 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 19958 if (ShrunkMask < 256) 19959 return UseMask(ShrunkMask); 19960 19961 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2. 19962 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 19963 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256) 19964 return UseMask(ExpandedMask); 19965 19966 // Potential improvements: 19967 // 19968 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here. 19969 // We could try to prefer Thumb1 immediates which can be lowered to a 19970 // two-instruction sequence. 19971 // We could try to recognize more legal ARM/Thumb2 immediates here. 19972 19973 return false; 19974 } 19975 19976 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode( 19977 SDValue Op, const APInt &OriginalDemandedBits, 19978 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, 19979 unsigned Depth) const { 19980 unsigned Opc = Op.getOpcode(); 19981 19982 switch (Opc) { 19983 case ARMISD::ASRL: 19984 case ARMISD::LSRL: { 19985 // If this is result 0 and the other result is unused, see if the demand 19986 // bits allow us to shrink this long shift into a standard small shift in 19987 // the opposite direction. 19988 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) && 19989 isa<ConstantSDNode>(Op->getOperand(2))) { 19990 unsigned ShAmt = Op->getConstantOperandVal(2); 19991 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32) 19992 << (32 - ShAmt))) 19993 return TLO.CombineTo( 19994 Op, TLO.DAG.getNode( 19995 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1), 19996 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32))); 19997 } 19998 break; 19999 } 20000 case ARMISD::VBICIMM: { 20001 SDValue Op0 = Op.getOperand(0); 20002 unsigned ModImm = Op.getConstantOperandVal(1); 20003 unsigned EltBits = 0; 20004 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits); 20005 if ((OriginalDemandedBits & Mask) == 0) 20006 return TLO.CombineTo(Op, Op0); 20007 } 20008 } 20009 20010 return TargetLowering::SimplifyDemandedBitsForTargetNode( 20011 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); 20012 } 20013 20014 //===----------------------------------------------------------------------===// 20015 // ARM Inline Assembly Support 20016 //===----------------------------------------------------------------------===// 20017 20018 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 20019 // Looking for "rev" which is V6+. 
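  // A single-statement asm of the form "rev $0, $1" with an "=l,l" constraint
  // string on an i32 value is the only pattern replaced here; it is rewritten
  // as an @llvm.bswap call by IntrinsicLowering below.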
20020 if (!Subtarget->hasV6Ops()) 20021 return false; 20022 20023 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); 20024 std::string AsmStr = IA->getAsmString(); 20025 SmallVector<StringRef, 4> AsmPieces; 20026 SplitString(AsmStr, AsmPieces, ";\n"); 20027 20028 switch (AsmPieces.size()) { 20029 default: return false; 20030 case 1: 20031 AsmStr = std::string(AsmPieces[0]); 20032 AsmPieces.clear(); 20033 SplitString(AsmStr, AsmPieces, " \t,"); 20034 20035 // rev $0, $1 20036 if (AsmPieces.size() == 3 && 20037 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 20038 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 20039 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 20040 if (Ty && Ty->getBitWidth() == 32) 20041 return IntrinsicLowering::LowerToByteSwap(CI); 20042 } 20043 break; 20044 } 20045 20046 return false; 20047 } 20048 20049 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 20050 // At this point, we have to lower this constraint to something else, so we 20051 // lower it to an "r" or "w". However, by doing this we will force the result 20052 // to be in register, while the X constraint is much more permissive. 20053 // 20054 // Although we are correct (we are free to emit anything, without 20055 // constraints), we might break use cases that would expect us to be more 20056 // efficient and emit something else. 20057 if (!Subtarget->hasVFP2Base()) 20058 return "r"; 20059 if (ConstraintVT.isFloatingPoint()) 20060 return "w"; 20061 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 20062 (ConstraintVT.getSizeInBits() == 64 || 20063 ConstraintVT.getSizeInBits() == 128)) 20064 return "w"; 20065 20066 return "r"; 20067 } 20068 20069 /// getConstraintType - Given a constraint letter, return the type of 20070 /// constraint it is for this target. 20071 ARMTargetLowering::ConstraintType 20072 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 20073 unsigned S = Constraint.size(); 20074 if (S == 1) { 20075 switch (Constraint[0]) { 20076 default: break; 20077 case 'l': return C_RegisterClass; 20078 case 'w': return C_RegisterClass; 20079 case 'h': return C_RegisterClass; 20080 case 'x': return C_RegisterClass; 20081 case 't': return C_RegisterClass; 20082 case 'j': return C_Immediate; // Constant for movw. 20083 // An address with a single base register. Due to the way we 20084 // currently handle addresses it is the same as an 'r' memory constraint. 20085 case 'Q': return C_Memory; 20086 } 20087 } else if (S == 2) { 20088 switch (Constraint[0]) { 20089 default: break; 20090 case 'T': return C_RegisterClass; 20091 // All 'U+' constraints are addresses. 20092 case 'U': return C_Memory; 20093 } 20094 } 20095 return TargetLowering::getConstraintType(Constraint); 20096 } 20097 20098 /// Examine constraint type and operand type and determine a weight value. 20099 /// This object must already have been set up with the operand type 20100 /// and the current alternative constraint selected. 20101 TargetLowering::ConstraintWeight 20102 ARMTargetLowering::getSingleConstraintMatchWeight( 20103 AsmOperandInfo &info, const char *constraint) const { 20104 ConstraintWeight weight = CW_Invalid; 20105 Value *CallOperandVal = info.CallOperandVal; 20106 // If we don't have a value, we can't do a match, 20107 // but allow it at the lowest weight. 20108 if (!CallOperandVal) 20109 return CW_Default; 20110 Type *type = CallOperandVal->getType(); 20111 // Look at the constraint type. 
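  // For example, an integer operand under an 'l' constraint is weighted as a
  // specific-register match on Thumb targets (where 'l' names the low
  // registers) and as an ordinary register match otherwise.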
20112 switch (*constraint) { 20113 default: 20114 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 20115 break; 20116 case 'l': 20117 if (type->isIntegerTy()) { 20118 if (Subtarget->isThumb()) 20119 weight = CW_SpecificReg; 20120 else 20121 weight = CW_Register; 20122 } 20123 break; 20124 case 'w': 20125 if (type->isFloatingPointTy()) 20126 weight = CW_Register; 20127 break; 20128 } 20129 return weight; 20130 } 20131 20132 using RCPair = std::pair<unsigned, const TargetRegisterClass *>; 20133 20134 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 20135 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 20136 switch (Constraint.size()) { 20137 case 1: 20138 // GCC ARM Constraint Letters 20139 switch (Constraint[0]) { 20140 case 'l': // Low regs or general regs. 20141 if (Subtarget->isThumb()) 20142 return RCPair(0U, &ARM::tGPRRegClass); 20143 return RCPair(0U, &ARM::GPRRegClass); 20144 case 'h': // High regs or no regs. 20145 if (Subtarget->isThumb()) 20146 return RCPair(0U, &ARM::hGPRRegClass); 20147 break; 20148 case 'r': 20149 if (Subtarget->isThumb1Only()) 20150 return RCPair(0U, &ARM::tGPRRegClass); 20151 return RCPair(0U, &ARM::GPRRegClass); 20152 case 'w': 20153 if (VT == MVT::Other) 20154 break; 20155 if (VT == MVT::f32) 20156 return RCPair(0U, &ARM::SPRRegClass); 20157 if (VT.getSizeInBits() == 64) 20158 return RCPair(0U, &ARM::DPRRegClass); 20159 if (VT.getSizeInBits() == 128) 20160 return RCPair(0U, &ARM::QPRRegClass); 20161 break; 20162 case 'x': 20163 if (VT == MVT::Other) 20164 break; 20165 if (VT == MVT::f32) 20166 return RCPair(0U, &ARM::SPR_8RegClass); 20167 if (VT.getSizeInBits() == 64) 20168 return RCPair(0U, &ARM::DPR_8RegClass); 20169 if (VT.getSizeInBits() == 128) 20170 return RCPair(0U, &ARM::QPR_8RegClass); 20171 break; 20172 case 't': 20173 if (VT == MVT::Other) 20174 break; 20175 if (VT == MVT::f32 || VT == MVT::i32) 20176 return RCPair(0U, &ARM::SPRRegClass); 20177 if (VT.getSizeInBits() == 64) 20178 return RCPair(0U, &ARM::DPR_VFP2RegClass); 20179 if (VT.getSizeInBits() == 128) 20180 return RCPair(0U, &ARM::QPR_VFP2RegClass); 20181 break; 20182 } 20183 break; 20184 20185 case 2: 20186 if (Constraint[0] == 'T') { 20187 switch (Constraint[1]) { 20188 default: 20189 break; 20190 case 'e': 20191 return RCPair(0U, &ARM::tGPREvenRegClass); 20192 case 'o': 20193 return RCPair(0U, &ARM::tGPROddRegClass); 20194 } 20195 } 20196 break; 20197 20198 default: 20199 break; 20200 } 20201 20202 if (StringRef("{cc}").equals_insensitive(Constraint)) 20203 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 20204 20205 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 20206 } 20207 20208 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 20209 /// vector. If it is invalid, don't add anything to Ops. 20210 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 20211 std::string &Constraint, 20212 std::vector<SDValue>&Ops, 20213 SelectionDAG &DAG) const { 20214 SDValue Result; 20215 20216 // Currently only support length 1 constraints. 
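  // The letters handled here are 'j' plus the GCC-style immediate-range
  // constraints 'I' through 'O'; anything else falls through to the generic
  // TargetLowering handling at the end of this function.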
20217 if (Constraint.length() != 1) return; 20218 20219 char ConstraintLetter = Constraint[0]; 20220 switch (ConstraintLetter) { 20221 default: break; 20222 case 'j': 20223 case 'I': case 'J': case 'K': case 'L': 20224 case 'M': case 'N': case 'O': 20225 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 20226 if (!C) 20227 return; 20228 20229 int64_t CVal64 = C->getSExtValue(); 20230 int CVal = (int) CVal64; 20231 // None of these constraints allow values larger than 32 bits. Check 20232 // that the value fits in an int. 20233 if (CVal != CVal64) 20234 return; 20235 20236 switch (ConstraintLetter) { 20237 case 'j': 20238 // Constant suitable for movw, must be between 0 and 20239 // 65535. 20240 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps())) 20241 if (CVal >= 0 && CVal <= 65535) 20242 break; 20243 return; 20244 case 'I': 20245 if (Subtarget->isThumb1Only()) { 20246 // This must be a constant between 0 and 255, for ADD 20247 // immediates. 20248 if (CVal >= 0 && CVal <= 255) 20249 break; 20250 } else if (Subtarget->isThumb2()) { 20251 // A constant that can be used as an immediate value in a 20252 // data-processing instruction. 20253 if (ARM_AM::getT2SOImmVal(CVal) != -1) 20254 break; 20255 } else { 20256 // A constant that can be used as an immediate value in a 20257 // data-processing instruction. 20258 if (ARM_AM::getSOImmVal(CVal) != -1) 20259 break; 20260 } 20261 return; 20262 20263 case 'J': 20264 if (Subtarget->isThumb1Only()) { 20265 // This must be a constant between -255 and -1, for negated ADD 20266 // immediates. This can be used in GCC with an "n" modifier that 20267 // prints the negated value, for use with SUB instructions. It is 20268 // not useful otherwise but is implemented for compatibility. 20269 if (CVal >= -255 && CVal <= -1) 20270 break; 20271 } else { 20272 // This must be a constant between -4095 and 4095. It is not clear 20273 // what this constraint is intended for. Implemented for 20274 // compatibility with GCC. 20275 if (CVal >= -4095 && CVal <= 4095) 20276 break; 20277 } 20278 return; 20279 20280 case 'K': 20281 if (Subtarget->isThumb1Only()) { 20282 // A 32-bit value where only one byte has a nonzero value. Exclude 20283 // zero to match GCC. This constraint is used by GCC internally for 20284 // constants that can be loaded with a move/shift combination. 20285 // It is not useful otherwise but is implemented for compatibility. 20286 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 20287 break; 20288 } else if (Subtarget->isThumb2()) { 20289 // A constant whose bitwise inverse can be used as an immediate 20290 // value in a data-processing instruction. This can be used in GCC 20291 // with a "B" modifier that prints the inverted value, for use with 20292 // BIC and MVN instructions. It is not useful otherwise but is 20293 // implemented for compatibility. 20294 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 20295 break; 20296 } else { 20297 // A constant whose bitwise inverse can be used as an immediate 20298 // value in a data-processing instruction. This can be used in GCC 20299 // with a "B" modifier that prints the inverted value, for use with 20300 // BIC and MVN instructions. It is not useful otherwise but is 20301 // implemented for compatibility. 20302 if (ARM_AM::getSOImmVal(~CVal) != -1) 20303 break; 20304 } 20305 return; 20306 20307 case 'L': 20308 if (Subtarget->isThumb1Only()) { 20309 // This must be a constant between -7 and 7, 20310 // for 3-operand ADD/SUB immediate instructions. 
20311 if (CVal >= -7 && CVal < 7) 20312 break; 20313 } else if (Subtarget->isThumb2()) { 20314 // A constant whose negation can be used as an immediate value in a 20315 // data-processing instruction. This can be used in GCC with an "n" 20316 // modifier that prints the negated value, for use with SUB 20317 // instructions. It is not useful otherwise but is implemented for 20318 // compatibility. 20319 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 20320 break; 20321 } else { 20322 // A constant whose negation can be used as an immediate value in a 20323 // data-processing instruction. This can be used in GCC with an "n" 20324 // modifier that prints the negated value, for use with SUB 20325 // instructions. It is not useful otherwise but is implemented for 20326 // compatibility. 20327 if (ARM_AM::getSOImmVal(-CVal) != -1) 20328 break; 20329 } 20330 return; 20331 20332 case 'M': 20333 if (Subtarget->isThumb1Only()) { 20334 // This must be a multiple of 4 between 0 and 1020, for 20335 // ADD sp + immediate. 20336 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 20337 break; 20338 } else { 20339 // A power of two or a constant between 0 and 32. This is used in 20340 // GCC for the shift amount on shifted register operands, but it is 20341 // useful in general for any shift amounts. 20342 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 20343 break; 20344 } 20345 return; 20346 20347 case 'N': 20348 if (Subtarget->isThumb1Only()) { 20349 // This must be a constant between 0 and 31, for shift amounts. 20350 if (CVal >= 0 && CVal <= 31) 20351 break; 20352 } 20353 return; 20354 20355 case 'O': 20356 if (Subtarget->isThumb1Only()) { 20357 // This must be a multiple of 4 between -508 and 508, for 20358 // ADD/SUB sp = sp + immediate. 20359 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 20360 break; 20361 } 20362 return; 20363 } 20364 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 20365 break; 20366 } 20367 20368 if (Result.getNode()) { 20369 Ops.push_back(Result); 20370 return; 20371 } 20372 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 20373 } 20374 20375 static RTLIB::Libcall getDivRemLibcall( 20376 const SDNode *N, MVT::SimpleValueType SVT) { 20377 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 20378 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 20379 "Unhandled Opcode in getDivRemLibcall"); 20380 bool isSigned = N->getOpcode() == ISD::SDIVREM || 20381 N->getOpcode() == ISD::SREM; 20382 RTLIB::Libcall LC; 20383 switch (SVT) { 20384 default: llvm_unreachable("Unexpected request for libcall!"); 20385 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 20386 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 20387 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 20388 case MVT::i64: LC = isSigned ? 
RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 20389 } 20390 return LC; 20391 } 20392 20393 static TargetLowering::ArgListTy getDivRemArgList( 20394 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { 20395 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 20396 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 20397 "Unhandled Opcode in getDivRemArgList"); 20398 bool isSigned = N->getOpcode() == ISD::SDIVREM || 20399 N->getOpcode() == ISD::SREM; 20400 TargetLowering::ArgListTy Args; 20401 TargetLowering::ArgListEntry Entry; 20402 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 20403 EVT ArgVT = N->getOperand(i).getValueType(); 20404 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 20405 Entry.Node = N->getOperand(i); 20406 Entry.Ty = ArgTy; 20407 Entry.IsSExt = isSigned; 20408 Entry.IsZExt = !isSigned; 20409 Args.push_back(Entry); 20410 } 20411 if (Subtarget->isTargetWindows() && Args.size() >= 2) 20412 std::swap(Args[0], Args[1]); 20413 return Args; 20414 } 20415 20416 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 20417 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 20418 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 20419 Subtarget->isTargetWindows()) && 20420 "Register-based DivRem lowering only"); 20421 unsigned Opcode = Op->getOpcode(); 20422 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 20423 "Invalid opcode for Div/Rem lowering"); 20424 bool isSigned = (Opcode == ISD::SDIVREM); 20425 EVT VT = Op->getValueType(0); 20426 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 20427 SDLoc dl(Op); 20428 20429 // If the target has hardware divide, use divide + multiply + subtract: 20430 // div = a / b 20431 // rem = a - b * div 20432 // return {div, rem} 20433 // This should be lowered into UDIV/SDIV + MLS later on. 20434 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 20435 : Subtarget->hasDivideInARMMode(); 20436 if (hasDivide && Op->getValueType(0).isSimple() && 20437 Op->getSimpleValueType(0) == MVT::i32) { 20438 unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; 20439 const SDValue Dividend = Op->getOperand(0); 20440 const SDValue Divisor = Op->getOperand(1); 20441 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); 20442 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); 20443 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); 20444 20445 SDValue Values[2] = {Div, Rem}; 20446 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); 20447 } 20448 20449 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 20450 VT.getSimpleVT().SimpleTy); 20451 SDValue InChain = DAG.getEntryNode(); 20452 20453 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 20454 DAG.getContext(), 20455 Subtarget); 20456 20457 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 20458 getPointerTy(DAG.getDataLayout())); 20459 20460 Type *RetTy = StructType::get(Ty, Ty); 20461 20462 if (Subtarget->isTargetWindows()) 20463 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); 20464 20465 TargetLowering::CallLoweringInfo CLI(DAG); 20466 CLI.setDebugLoc(dl).setChain(InChain) 20467 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 20468 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 20469 20470 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 20471 return CallInfo.first; 20472 } 20473 20474 // Lowers REM using divmod helpers 20475 // see RTABI section 4.2/4.3 20476 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 20477 // Build return types (div and rem) 20478 std::vector<Type*> RetTyParams; 20479 Type *RetTyElement; 20480 20481 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 20482 default: llvm_unreachable("Unexpected request for libcall!"); 20483 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; 20484 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 20485 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 20486 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 20487 } 20488 20489 RetTyParams.push_back(RetTyElement); 20490 RetTyParams.push_back(RetTyElement); 20491 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 20492 Type *RetTy = StructType::get(*DAG.getContext(), ret); 20493 20494 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 
20495 SimpleTy); 20496 SDValue InChain = DAG.getEntryNode(); 20497 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), 20498 Subtarget); 20499 bool isSigned = N->getOpcode() == ISD::SREM; 20500 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 20501 getPointerTy(DAG.getDataLayout())); 20502 20503 if (Subtarget->isTargetWindows()) 20504 InChain = WinDBZCheckDenominator(DAG, N, InChain); 20505 20506 // Lower call 20507 CallLoweringInfo CLI(DAG); 20508 CLI.setChain(InChain) 20509 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 20510 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 20511 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 20512 20513 // Return second (rem) result operand (first contains div) 20514 SDNode *ResNode = CallResult.first.getNode(); 20515 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 20516 return ResNode->getOperand(1); 20517 } 20518 20519 SDValue 20520 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 20521 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 20522 SDLoc DL(Op); 20523 20524 // Get the inputs. 20525 SDValue Chain = Op.getOperand(0); 20526 SDValue Size = Op.getOperand(1); 20527 20528 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 20529 "no-stack-arg-probe")) { 20530 MaybeAlign Align = 20531 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); 20532 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 20533 Chain = SP.getValue(1); 20534 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); 20535 if (Align) 20536 SP = 20537 DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), 20538 DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32)); 20539 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); 20540 SDValue Ops[2] = { SP, Chain }; 20541 return DAG.getMergeValues(Ops, DL); 20542 } 20543 20544 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 20545 DAG.getConstant(2, DL, MVT::i32)); 20546 20547 SDValue Flag; 20548 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 20549 Flag = Chain.getValue(1); 20550 20551 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 20552 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 20553 20554 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 20555 Chain = NewSP.getValue(1); 20556 20557 SDValue Ops[2] = { NewSP, Chain }; 20558 return DAG.getMergeValues(Ops, DL); 20559 } 20560 20561 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 20562 bool IsStrict = Op->isStrictFPOpcode(); 20563 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 20564 const unsigned DstSz = Op.getValueType().getSizeInBits(); 20565 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits(); 20566 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 && 20567 "Unexpected type for custom-lowering FP_EXTEND"); 20568 20569 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 20570 "With both FP DP and 16, any FP conversion is legal!"); 20571 20572 assert(!(DstSz == 32 && Subtarget->hasFP16()) && 20573 "With FP16, 16 to 32 conversion is legal!"); 20574 20575 // Converting from 32 -> 64 is valid if we have FP64. 
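  // (a single VCVT.F64.F32); the node is returned unchanged, or rebuilt as a
  // non-strict FP_EXTEND until strict selection patterns exist (see the FIXME
  // below).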
20576 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) { 20577 // FIXME: Remove this when we have strict fp instruction selection patterns 20578 if (IsStrict) { 20579 SDLoc Loc(Op); 20580 SDValue Result = DAG.getNode(ISD::FP_EXTEND, 20581 Loc, Op.getValueType(), SrcVal); 20582 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 20583 } 20584 return Op; 20585 } 20586 20587 // Either we are converting from 16 -> 64, without FP16 and/or 20588 // FP.double-precision or without Armv8-fp. So we must do it in two 20589 // steps. 20590 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32 20591 // without FP16. So we must do a function call. 20592 SDLoc Loc(Op); 20593 RTLIB::Libcall LC; 20594 MakeLibCallOptions CallOptions; 20595 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 20596 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) { 20597 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64()); 20598 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32); 20599 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64); 20600 if (Supported) { 20601 if (IsStrict) { 20602 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc, 20603 {DstVT, MVT::Other}, {Chain, SrcVal}); 20604 Chain = SrcVal.getValue(1); 20605 } else { 20606 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal); 20607 } 20608 } else { 20609 LC = RTLIB::getFPEXT(SrcVT, DstVT); 20610 assert(LC != RTLIB::UNKNOWN_LIBCALL && 20611 "Unexpected type for custom-lowering FP_EXTEND"); 20612 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 20613 Loc, Chain); 20614 } 20615 } 20616 20617 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal; 20618 } 20619 20620 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 20621 bool IsStrict = Op->isStrictFPOpcode(); 20622 20623 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 20624 EVT SrcVT = SrcVal.getValueType(); 20625 EVT DstVT = Op.getValueType(); 20626 const unsigned DstSz = Op.getValueType().getSizeInBits(); 20627 const unsigned SrcSz = SrcVT.getSizeInBits(); 20628 (void)DstSz; 20629 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 && 20630 "Unexpected type for custom-lowering FP_ROUND"); 20631 20632 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 20633 "With both FP DP and 16, any FP conversion is legal!"); 20634 20635 SDLoc Loc(Op); 20636 20637 // Instruction from 32 -> 16 if hasFP16 is valid 20638 if (SrcSz == 32 && Subtarget->hasFP16()) 20639 return Op; 20640 20641 // Lib call from 32 -> 16 / 64 -> [32, 16] 20642 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); 20643 assert(LC != RTLIB::UNKNOWN_LIBCALL && 20644 "Unexpected type for custom-lowering FP_ROUND"); 20645 MakeLibCallOptions CallOptions; 20646 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 20647 SDValue Result; 20648 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 20649 Loc, Chain); 20650 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result; 20651 } 20652 20653 bool 20654 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 20655 // The ARM target isn't yet aware of offsets. 
  return false;
}

bool ARM::isBitFieldInvertedMask(unsigned v) {
  if (v == 0xffffffff)
    return false;

  // There can be 1s on either or both "outsides"; all the "inside" bits must
  // be 0s.
  return isShiftedMask_32(~v);
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  if (!Subtarget->hasVFP3Base())
    return false;
  if (VT == MVT::f16 && Subtarget->hasFullFP16())
    return ARM_AM::getFP16Imm(Imm) != -1;
  if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
      ARM_AM::getFP32FP16Imm(Imm) != -1)
    return true;
  if (VT == MVT::f32)
    return ARM_AM::getFP32Imm(Imm) != -1;
  if (VT == MVT::f64 && Subtarget->hasFP64())
    return ARM_AM::getFP64Imm(Imm) != -1;
  return false;
}

/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vld2dup:
  case Intrinsic::arm_neon_vld3dup:
  case Intrinsic::arm_neon_vld4dup: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
    Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
    // Volatile loads with NEON intrinsics are not supported.
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
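    // For example, a vld1x2 returning {<4 x i32>, <4 x i32>} covers 256 bits,
    // so memVT becomes v4i64.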
20723 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 20724 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 20725 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 20726 Info.ptrVal = I.getArgOperand(I.arg_size() - 1); 20727 Info.offset = 0; 20728 Info.align.reset(); 20729 // volatile loads with NEON intrinsics not supported 20730 Info.flags = MachineMemOperand::MOLoad; 20731 return true; 20732 } 20733 case Intrinsic::arm_neon_vst1: 20734 case Intrinsic::arm_neon_vst2: 20735 case Intrinsic::arm_neon_vst3: 20736 case Intrinsic::arm_neon_vst4: 20737 case Intrinsic::arm_neon_vst2lane: 20738 case Intrinsic::arm_neon_vst3lane: 20739 case Intrinsic::arm_neon_vst4lane: { 20740 Info.opc = ISD::INTRINSIC_VOID; 20741 // Conservatively set memVT to the entire set of vectors stored. 20742 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 20743 unsigned NumElts = 0; 20744 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) { 20745 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 20746 if (!ArgTy->isVectorTy()) 20747 break; 20748 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 20749 } 20750 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 20751 Info.ptrVal = I.getArgOperand(0); 20752 Info.offset = 0; 20753 Value *AlignArg = I.getArgOperand(I.arg_size() - 1); 20754 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); 20755 // volatile stores with NEON intrinsics not supported 20756 Info.flags = MachineMemOperand::MOStore; 20757 return true; 20758 } 20759 case Intrinsic::arm_neon_vst1x2: 20760 case Intrinsic::arm_neon_vst1x3: 20761 case Intrinsic::arm_neon_vst1x4: { 20762 Info.opc = ISD::INTRINSIC_VOID; 20763 // Conservatively set memVT to the entire set of vectors stored. 20764 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 20765 unsigned NumElts = 0; 20766 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) { 20767 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 20768 if (!ArgTy->isVectorTy()) 20769 break; 20770 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 20771 } 20772 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 20773 Info.ptrVal = I.getArgOperand(0); 20774 Info.offset = 0; 20775 Info.align.reset(); 20776 // volatile stores with NEON intrinsics not supported 20777 Info.flags = MachineMemOperand::MOStore; 20778 return true; 20779 } 20780 case Intrinsic::arm_mve_vld2q: 20781 case Intrinsic::arm_mve_vld4q: { 20782 Info.opc = ISD::INTRINSIC_W_CHAIN; 20783 // Conservatively set memVT to the entire set of vectors loaded. 20784 Type *VecTy = cast<StructType>(I.getType())->getElementType(1); 20785 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4; 20786 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); 20787 Info.ptrVal = I.getArgOperand(0); 20788 Info.offset = 0; 20789 Info.align = Align(VecTy->getScalarSizeInBits() / 8); 20790 // volatile loads with MVE intrinsics not supported 20791 Info.flags = MachineMemOperand::MOLoad; 20792 return true; 20793 } 20794 case Intrinsic::arm_mve_vst2q: 20795 case Intrinsic::arm_mve_vst4q: { 20796 Info.opc = ISD::INTRINSIC_VOID; 20797 // Conservatively set memVT to the entire set of vectors stored. 20798 Type *VecTy = I.getArgOperand(1)->getType(); 20799 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 
2 : 4; 20800 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); 20801 Info.ptrVal = I.getArgOperand(0); 20802 Info.offset = 0; 20803 Info.align = Align(VecTy->getScalarSizeInBits() / 8); 20804 // volatile stores with MVE intrinsics not supported 20805 Info.flags = MachineMemOperand::MOStore; 20806 return true; 20807 } 20808 case Intrinsic::arm_mve_vldr_gather_base: 20809 case Intrinsic::arm_mve_vldr_gather_base_predicated: { 20810 Info.opc = ISD::INTRINSIC_W_CHAIN; 20811 Info.ptrVal = nullptr; 20812 Info.memVT = MVT::getVT(I.getType()); 20813 Info.align = Align(1); 20814 Info.flags |= MachineMemOperand::MOLoad; 20815 return true; 20816 } 20817 case Intrinsic::arm_mve_vldr_gather_base_wb: 20818 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: { 20819 Info.opc = ISD::INTRINSIC_W_CHAIN; 20820 Info.ptrVal = nullptr; 20821 Info.memVT = MVT::getVT(I.getType()->getContainedType(0)); 20822 Info.align = Align(1); 20823 Info.flags |= MachineMemOperand::MOLoad; 20824 return true; 20825 } 20826 case Intrinsic::arm_mve_vldr_gather_offset: 20827 case Intrinsic::arm_mve_vldr_gather_offset_predicated: { 20828 Info.opc = ISD::INTRINSIC_W_CHAIN; 20829 Info.ptrVal = nullptr; 20830 MVT DataVT = MVT::getVT(I.getType()); 20831 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); 20832 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), 20833 DataVT.getVectorNumElements()); 20834 Info.align = Align(1); 20835 Info.flags |= MachineMemOperand::MOLoad; 20836 return true; 20837 } 20838 case Intrinsic::arm_mve_vstr_scatter_base: 20839 case Intrinsic::arm_mve_vstr_scatter_base_predicated: { 20840 Info.opc = ISD::INTRINSIC_VOID; 20841 Info.ptrVal = nullptr; 20842 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); 20843 Info.align = Align(1); 20844 Info.flags |= MachineMemOperand::MOStore; 20845 return true; 20846 } 20847 case Intrinsic::arm_mve_vstr_scatter_base_wb: 20848 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: { 20849 Info.opc = ISD::INTRINSIC_W_CHAIN; 20850 Info.ptrVal = nullptr; 20851 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); 20852 Info.align = Align(1); 20853 Info.flags |= MachineMemOperand::MOStore; 20854 return true; 20855 } 20856 case Intrinsic::arm_mve_vstr_scatter_offset: 20857 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: { 20858 Info.opc = ISD::INTRINSIC_VOID; 20859 Info.ptrVal = nullptr; 20860 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType()); 20861 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); 20862 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), 20863 DataVT.getVectorNumElements()); 20864 Info.align = Align(1); 20865 Info.flags |= MachineMemOperand::MOStore; 20866 return true; 20867 } 20868 case Intrinsic::arm_ldaex: 20869 case Intrinsic::arm_ldrex: { 20870 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 20871 Type *ValTy = I.getParamElementType(0); 20872 Info.opc = ISD::INTRINSIC_W_CHAIN; 20873 Info.memVT = MVT::getVT(ValTy); 20874 Info.ptrVal = I.getArgOperand(0); 20875 Info.offset = 0; 20876 Info.align = DL.getABITypeAlign(ValTy); 20877 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 20878 return true; 20879 } 20880 case Intrinsic::arm_stlex: 20881 case Intrinsic::arm_strex: { 20882 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 20883 Type *ValTy = I.getParamElementType(1); 20884 Info.opc = ISD::INTRINSIC_W_CHAIN; 20885 Info.memVT = MVT::getVT(ValTy); 20886 Info.ptrVal = I.getArgOperand(1); 20887 
Info.offset = 0; 20888 Info.align = DL.getABITypeAlign(ValTy); 20889 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 20890 return true; 20891 } 20892 case Intrinsic::arm_stlexd: 20893 case Intrinsic::arm_strexd: 20894 Info.opc = ISD::INTRINSIC_W_CHAIN; 20895 Info.memVT = MVT::i64; 20896 Info.ptrVal = I.getArgOperand(2); 20897 Info.offset = 0; 20898 Info.align = Align(8); 20899 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 20900 return true; 20901 20902 case Intrinsic::arm_ldaexd: 20903 case Intrinsic::arm_ldrexd: 20904 Info.opc = ISD::INTRINSIC_W_CHAIN; 20905 Info.memVT = MVT::i64; 20906 Info.ptrVal = I.getArgOperand(0); 20907 Info.offset = 0; 20908 Info.align = Align(8); 20909 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 20910 return true; 20911 20912 default: 20913 break; 20914 } 20915 20916 return false; 20917 } 20918 20919 /// Returns true if it is beneficial to convert a load of a constant 20920 /// to just the constant itself. 20921 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 20922 Type *Ty) const { 20923 assert(Ty->isIntegerTy()); 20924 20925 unsigned Bits = Ty->getPrimitiveSizeInBits(); 20926 if (Bits == 0 || Bits > 32) 20927 return false; 20928 return true; 20929 } 20930 20931 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 20932 unsigned Index) const { 20933 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 20934 return false; 20935 20936 return (Index == 0 || Index == ResVT.getVectorNumElements()); 20937 } 20938 20939 Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, 20940 ARM_MB::MemBOpt Domain) const { 20941 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 20942 20943 // First, if the target has no DMB, see what fallback we can use. 20944 if (!Subtarget->hasDataBarrier()) { 20945 // Some ARMv6 cpus can support data barriers with an mcr instruction. 20946 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 20947 // here. 20948 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { 20949 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); 20950 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), 20951 Builder.getInt32(0), Builder.getInt32(7), 20952 Builder.getInt32(10), Builder.getInt32(5)}; 20953 return Builder.CreateCall(MCR, args); 20954 } else { 20955 // Instead of using barriers, atomic accesses on these subtargets use 20956 // libcalls. 20957 llvm_unreachable("makeDMB on a target so old that it has no barriers"); 20958 } 20959 } else { 20960 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); 20961 // Only a full system barrier exists in the M-class architectures. 20962 Domain = Subtarget->isMClass() ? 
ARM_MB::SY : Domain; 20963 Constant *CDomain = Builder.getInt32(Domain); 20964 return Builder.CreateCall(DMB, CDomain); 20965 } 20966 } 20967 20968 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 20969 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, 20970 Instruction *Inst, 20971 AtomicOrdering Ord) const { 20972 switch (Ord) { 20973 case AtomicOrdering::NotAtomic: 20974 case AtomicOrdering::Unordered: 20975 llvm_unreachable("Invalid fence: unordered/non-atomic"); 20976 case AtomicOrdering::Monotonic: 20977 case AtomicOrdering::Acquire: 20978 return nullptr; // Nothing to do 20979 case AtomicOrdering::SequentiallyConsistent: 20980 if (!Inst->hasAtomicStore()) 20981 return nullptr; // Nothing to do 20982 LLVM_FALLTHROUGH; 20983 case AtomicOrdering::Release: 20984 case AtomicOrdering::AcquireRelease: 20985 if (Subtarget->preferISHSTBarriers()) 20986 return makeDMB(Builder, ARM_MB::ISHST); 20987 // FIXME: add a comment with a link to documentation justifying this. 20988 else 20989 return makeDMB(Builder, ARM_MB::ISH); 20990 } 20991 llvm_unreachable("Unknown fence ordering in emitLeadingFence"); 20992 } 20993 20994 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, 20995 Instruction *Inst, 20996 AtomicOrdering Ord) const { 20997 switch (Ord) { 20998 case AtomicOrdering::NotAtomic: 20999 case AtomicOrdering::Unordered: 21000 llvm_unreachable("Invalid fence: unordered/not-atomic"); 21001 case AtomicOrdering::Monotonic: 21002 case AtomicOrdering::Release: 21003 return nullptr; // Nothing to do 21004 case AtomicOrdering::Acquire: 21005 case AtomicOrdering::AcquireRelease: 21006 case AtomicOrdering::SequentiallyConsistent: 21007 return makeDMB(Builder, ARM_MB::ISH); 21008 } 21009 llvm_unreachable("Unknown fence ordering in emitTrailingFence"); 21010 } 21011 21012 // Loads and stores less than 64-bits are already atomic; ones above that 21013 // are doomed anyway, so defer to the default libcall and blame the OS when 21014 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 21015 // anything for those. 21016 TargetLoweringBase::AtomicExpansionKind 21017 ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 21018 bool has64BitAtomicStore; 21019 if (Subtarget->isMClass()) 21020 has64BitAtomicStore = false; 21021 else if (Subtarget->isThumb()) 21022 has64BitAtomicStore = Subtarget->hasV7Ops(); 21023 else 21024 has64BitAtomicStore = Subtarget->hasV6Ops(); 21025 21026 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 21027 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand 21028 : AtomicExpansionKind::None; 21029 } 21030 21031 // Loads and stores less than 64-bits are already atomic; ones above that 21032 // are doomed anyway, so defer to the default libcall and blame the OS when 21033 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 21034 // anything for those. 21035 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. 
A15 has that 21036 // guarantee, see DDI0406C ARM architecture reference manual, 21037 // sections A8.8.72-74 LDRD) 21038 TargetLowering::AtomicExpansionKind 21039 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 21040 bool has64BitAtomicLoad; 21041 if (Subtarget->isMClass()) 21042 has64BitAtomicLoad = false; 21043 else if (Subtarget->isThumb()) 21044 has64BitAtomicLoad = Subtarget->hasV7Ops(); 21045 else 21046 has64BitAtomicLoad = Subtarget->hasV6Ops(); 21047 21048 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 21049 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly 21050 : AtomicExpansionKind::None; 21051 } 21052 21053 // For the real atomic operations, we have ldrex/strex up to 32 bits, 21054 // and up to 64 bits on the non-M profiles 21055 TargetLowering::AtomicExpansionKind 21056 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 21057 if (AI->isFloatingPointOperation()) 21058 return AtomicExpansionKind::CmpXChg; 21059 21060 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 21061 bool hasAtomicRMW; 21062 if (Subtarget->isMClass()) 21063 hasAtomicRMW = Subtarget->hasV8MBaselineOps(); 21064 else if (Subtarget->isThumb()) 21065 hasAtomicRMW = Subtarget->hasV7Ops(); 21066 else 21067 hasAtomicRMW = Subtarget->hasV6Ops(); 21068 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) { 21069 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 21070 // implement atomicrmw without spilling. If the target address is also on 21071 // the stack and close enough to the spill slot, this can lead to a 21072 // situation where the monitor always gets cleared and the atomic operation 21073 // can never succeed. So at -O0 lower this operation to a CAS loop. 21074 if (getTargetMachine().getOptLevel() == CodeGenOpt::None) 21075 return AtomicExpansionKind::CmpXChg; 21076 return AtomicExpansionKind::LLSC; 21077 } 21078 return AtomicExpansionKind::None; 21079 } 21080 21081 // Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32 21082 // bits, and up to 64 bits on the non-M profiles. 21083 TargetLowering::AtomicExpansionKind 21084 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { 21085 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 21086 // implement cmpxchg without spilling. If the address being exchanged is also 21087 // on the stack and close enough to the spill slot, this can lead to a 21088 // situation where the monitor always gets cleared and the atomic operation 21089 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 21090 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits(); 21091 bool HasAtomicCmpXchg; 21092 if (Subtarget->isMClass()) 21093 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps(); 21094 else if (Subtarget->isThumb()) 21095 HasAtomicCmpXchg = Subtarget->hasV7Ops(); 21096 else 21097 HasAtomicCmpXchg = Subtarget->hasV6Ops(); 21098 if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg && 21099 Size <= (Subtarget->isMClass() ? 
32U : 64U)) 21100 return AtomicExpansionKind::LLSC; 21101 return AtomicExpansionKind::None; 21102 } 21103 21104 bool ARMTargetLowering::shouldInsertFencesForAtomic( 21105 const Instruction *I) const { 21106 return InsertFencesForAtomic; 21107 } 21108 21109 bool ARMTargetLowering::useLoadStackGuardNode() const { return true; } 21110 21111 void ARMTargetLowering::insertSSPDeclarations(Module &M) const { 21112 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 21113 return TargetLowering::insertSSPDeclarations(M); 21114 21115 // MSVC CRT has a global variable holding security cookie. 21116 M.getOrInsertGlobal("__security_cookie", 21117 Type::getInt8PtrTy(M.getContext())); 21118 21119 // MSVC CRT has a function to validate security cookie. 21120 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( 21121 "__security_check_cookie", Type::getVoidTy(M.getContext()), 21122 Type::getInt8PtrTy(M.getContext())); 21123 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) 21124 F->addParamAttr(0, Attribute::AttrKind::InReg); 21125 } 21126 21127 Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const { 21128 // MSVC CRT has a global variable holding security cookie. 21129 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 21130 return M.getGlobalVariable("__security_cookie"); 21131 return TargetLowering::getSDagStackGuard(M); 21132 } 21133 21134 Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { 21135 // MSVC CRT has a function to validate security cookie. 21136 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 21137 return M.getFunction("__security_check_cookie"); 21138 return TargetLowering::getSSPStackGuardCheck(M); 21139 } 21140 21141 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, 21142 unsigned &Cost) const { 21143 // If we do not have NEON, vector types are not natively supported. 21144 if (!Subtarget->hasNEON()) 21145 return false; 21146 21147 // Floating point values and vector values map to the same register file. 21148 // Therefore, although we could do a store extract of a vector type, this is 21149 // better to leave at float as we have more freedom in the addressing mode for 21150 // those. 21151 if (VectorTy->isFPOrFPVectorTy()) 21152 return false; 21153 21154 // If the index is unknown at compile time, this is very expensive to lower 21155 // and it is not possible to combine the store with the extract. 21156 if (!isa<ConstantInt>(Idx)) 21157 return false; 21158 21159 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); 21160 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize(); 21161 // We can do a store + vector extract on any vector that fits perfectly in a D 21162 // or Q register. 
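// For example, a 64-bit vector such as <8 x i8>, <4 x i16> or <2 x i32> fits
// in a single D register, and a 128-bit vector such as <16 x i8>, <8 x i16>,
// <4 x i32> or <2 x i64> fits in a single Q register.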
21163 if (BitWidth == 64 || BitWidth == 128) { 21164 Cost = 0; 21165 return true; 21166 } 21167 return false; 21168 } 21169 21170 bool ARMTargetLowering::isCheapToSpeculateCttz() const { 21171 return Subtarget->hasV6T2Ops(); 21172 } 21173 21174 bool ARMTargetLowering::isCheapToSpeculateCtlz() const { 21175 return Subtarget->hasV6T2Ops(); 21176 } 21177 21178 bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { 21179 return !Subtarget->hasMinSize() || Subtarget->isTargetWindows(); 21180 } 21181 21182 Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, 21183 Value *Addr, 21184 AtomicOrdering Ord) const { 21185 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 21186 bool IsAcquire = isAcquireOrStronger(Ord); 21187 21188 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd 21189 // intrinsic must return {i32, i32} and we have to recombine them into a 21190 // single i64 here. 21191 if (ValueTy->getPrimitiveSizeInBits() == 64) { 21192 Intrinsic::ID Int = 21193 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; 21194 Function *Ldrex = Intrinsic::getDeclaration(M, Int); 21195 21196 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 21197 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); 21198 21199 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 21200 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 21201 if (!Subtarget->isLittle()) 21202 std::swap (Lo, Hi); 21203 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); 21204 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); 21205 return Builder.CreateOr( 21206 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64"); 21207 } 21208 21209 Type *Tys[] = { Addr->getType() }; 21210 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; 21211 Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); 21212 CallInst *CI = Builder.CreateCall(Ldrex, Addr); 21213 21214 CI->addParamAttr( 21215 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy)); 21216 return Builder.CreateTruncOrBitCast(CI, ValueTy); 21217 } 21218 21219 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 21220 IRBuilderBase &Builder) const { 21221 if (!Subtarget->hasV7Ops()) 21222 return; 21223 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 21224 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); 21225 } 21226 21227 Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, 21228 Value *Val, Value *Addr, 21229 AtomicOrdering Ord) const { 21230 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 21231 bool IsRelease = isReleaseOrStronger(Ord); 21232 21233 // Since the intrinsics must have legal type, the i64 intrinsics take two 21234 // parameters: "i32, i32". We must marshal Val into the appropriate form 21235 // before the call. 21236 if (Val->getType()->getPrimitiveSizeInBits() == 64) { 21237 Intrinsic::ID Int = 21238 IsRelease ? 
Intrinsic::arm_stlexd : Intrinsic::arm_strexd; 21239 Function *Strex = Intrinsic::getDeclaration(M, Int); 21240 Type *Int32Ty = Type::getInt32Ty(M->getContext()); 21241 21242 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); 21243 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); 21244 if (!Subtarget->isLittle()) 21245 std::swap(Lo, Hi); 21246 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 21247 return Builder.CreateCall(Strex, {Lo, Hi, Addr}); 21248 } 21249 21250 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; 21251 Type *Tys[] = { Addr->getType() }; 21252 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); 21253 21254 CallInst *CI = Builder.CreateCall( 21255 Strex, {Builder.CreateZExtOrBitCast( 21256 Val, Strex->getFunctionType()->getParamType(0)), 21257 Addr}); 21258 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType, 21259 Val->getType())); 21260 return CI; 21261 } 21262 21263 21264 bool ARMTargetLowering::alignLoopsWithOptSize() const { 21265 return Subtarget->isMClass(); 21266 } 21267 21268 /// A helper function for determining the number of interleaved accesses we 21269 /// will generate when lowering accesses of the given type. 21270 unsigned 21271 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, 21272 const DataLayout &DL) const { 21273 return (DL.getTypeSizeInBits(VecTy) + 127) / 128; 21274 } 21275 21276 bool ARMTargetLowering::isLegalInterleavedAccessType( 21277 unsigned Factor, FixedVectorType *VecTy, Align Alignment, 21278 const DataLayout &DL) const { 21279 21280 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 21281 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); 21282 21283 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps()) 21284 return false; 21285 21286 // Ensure the vector doesn't have f16 elements. Even though we could do an 21287 // i16 vldN, we can't hold the f16 vectors and will end up converting via 21288 // f32. 21289 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy()) 21290 return false; 21291 if (Subtarget->hasMVEIntegerOps() && Factor == 3) 21292 return false; 21293 21294 // Ensure the number of vector elements is greater than 1. 21295 if (VecTy->getNumElements() < 2) 21296 return false; 21297 21298 // Ensure the element type is legal. 21299 if (ElSize != 8 && ElSize != 16 && ElSize != 32) 21300 return false; 21301 // And the alignment if high enough under MVE. 21302 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8) 21303 return false; 21304 21305 // Ensure the total vector size is 64 or a multiple of 128. Types larger than 21306 // 128 will be split into multiple interleaved accesses. 21307 if (Subtarget->hasNEON() && VecSize == 64) 21308 return true; 21309 return VecSize % 128 == 0; 21310 } 21311 21312 unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { 21313 if (Subtarget->hasNEON()) 21314 return 4; 21315 if (Subtarget->hasMVEIntegerOps()) 21316 return MVEMaxSupportedInterleaveFactor; 21317 return TargetLoweringBase::getMaxSupportedInterleaveFactor(); 21318 } 21319 21320 /// Lower an interleaved load into a vldN intrinsic. 21321 /// 21322 /// E.g. 
Lower an interleaved load (Factor = 2): 21323 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 21324 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 21325 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 21326 /// 21327 /// Into: 21328 /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) 21329 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 21330 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 21331 bool ARMTargetLowering::lowerInterleavedLoad( 21332 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 21333 ArrayRef<unsigned> Indices, unsigned Factor) const { 21334 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 21335 "Invalid interleave factor"); 21336 assert(!Shuffles.empty() && "Empty shufflevector input"); 21337 assert(Shuffles.size() == Indices.size() && 21338 "Unmatched number of shufflevectors and indices"); 21339 21340 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); 21341 Type *EltTy = VecTy->getElementType(); 21342 21343 const DataLayout &DL = LI->getModule()->getDataLayout(); 21344 Align Alignment = LI->getAlign(); 21345 21346 // Skip if we do not have NEON and skip illegal vector types. We can 21347 // "legalize" wide vector types into multiple interleaved accesses as long as 21348 // the vector types are divisible by 128. 21349 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL)) 21350 return false; 21351 21352 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); 21353 21354 // A pointer vector can not be the return type of the ldN intrinsics. Need to 21355 // load integer vectors first and then convert to pointer vectors. 21356 if (EltTy->isPointerTy()) 21357 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy); 21358 21359 IRBuilder<> Builder(LI); 21360 21361 // The base address of the load. 21362 Value *BaseAddr = LI->getPointerOperand(); 21363 21364 if (NumLoads > 1) { 21365 // If we're going to generate more than one load, reset the sub-vector type 21366 // to something legal. 21367 VecTy = FixedVectorType::get(VecTy->getElementType(), 21368 VecTy->getNumElements() / NumLoads); 21369 21370 // We will compute the pointer operand of each load from the original base 21371 // address using GEPs. Cast the base address to a pointer to the scalar 21372 // element type. 21373 BaseAddr = Builder.CreateBitCast( 21374 BaseAddr, 21375 VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); 21376 } 21377 21378 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); 21379 21380 auto createLoadIntrinsic = [&](Value *BaseAddr) { 21381 if (Subtarget->hasNEON()) { 21382 Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); 21383 Type *Tys[] = {VecTy, Int8Ptr}; 21384 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, 21385 Intrinsic::arm_neon_vld3, 21386 Intrinsic::arm_neon_vld4}; 21387 Function *VldnFunc = 21388 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 21389 21390 SmallVector<Value *, 2> Ops; 21391 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); 21392 Ops.push_back(Builder.getInt32(LI->getAlign().value())); 21393 21394 return Builder.CreateCall(VldnFunc, Ops, "vldN"); 21395 } else { 21396 assert((Factor == 2 || Factor == 4) && 21397 "expected interleave factor of 2 or 4 for MVE"); 21398 Intrinsic::ID LoadInts = 21399 Factor == 2 ? 
Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; 21400 Type *VecEltTy = 21401 VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()); 21402 Type *Tys[] = {VecTy, VecEltTy}; 21403 Function *VldnFunc = 21404 Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); 21405 21406 SmallVector<Value *, 2> Ops; 21407 Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy)); 21408 return Builder.CreateCall(VldnFunc, Ops, "vldN"); 21409 } 21410 }; 21411 21412 // Holds sub-vectors extracted from the load intrinsic return values. The 21413 // sub-vectors are associated with the shufflevector instructions they will 21414 // replace. 21415 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; 21416 21417 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { 21418 // If we're generating more than one load, compute the base address of 21419 // subsequent loads as an offset from the previous. 21420 if (LoadCount > 0) 21421 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr, 21422 VecTy->getNumElements() * Factor); 21423 21424 CallInst *VldN = createLoadIntrinsic(BaseAddr); 21425 21426 // Replace uses of each shufflevector with the corresponding vector loaded 21427 // by ldN. 21428 for (unsigned i = 0; i < Shuffles.size(); i++) { 21429 ShuffleVectorInst *SV = Shuffles[i]; 21430 unsigned Index = Indices[i]; 21431 21432 Value *SubVec = Builder.CreateExtractValue(VldN, Index); 21433 21434 // Convert the integer vector to pointer vector if the element is pointer. 21435 if (EltTy->isPointerTy()) 21436 SubVec = Builder.CreateIntToPtr( 21437 SubVec, 21438 FixedVectorType::get(SV->getType()->getElementType(), VecTy)); 21439 21440 SubVecs[SV].push_back(SubVec); 21441 } 21442 } 21443 21444 // Replace uses of the shufflevector instructions with the sub-vectors 21445 // returned by the load intrinsic. If a shufflevector instruction is 21446 // associated with more than one sub-vector, those sub-vectors will be 21447 // concatenated into a single wide vector. 21448 for (ShuffleVectorInst *SVI : Shuffles) { 21449 auto &SubVec = SubVecs[SVI]; 21450 auto *WideVec = 21451 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; 21452 SVI->replaceAllUsesWith(WideVec); 21453 } 21454 21455 return true; 21456 } 21457 21458 /// Lower an interleaved store into a vstN intrinsic. 21459 /// 21460 /// E.g. Lower an interleaved store (Factor = 3): 21461 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 21462 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 21463 /// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 21464 /// 21465 /// Into: 21466 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 21467 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 21468 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 21469 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 21470 /// 21471 /// Note that the new shufflevectors will be removed and we'll only generate one 21472 /// vst3 instruction in CodeGen. 21473 /// 21474 /// Example for a more general valid mask (Factor 3). 
Lower: 21475 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, 21476 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> 21477 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 21478 /// 21479 /// Into: 21480 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> 21481 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> 21482 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> 21483 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 21484 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, 21485 ShuffleVectorInst *SVI, 21486 unsigned Factor) const { 21487 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 21488 "Invalid interleave factor"); 21489 21490 auto *VecTy = cast<FixedVectorType>(SVI->getType()); 21491 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); 21492 21493 unsigned LaneLen = VecTy->getNumElements() / Factor; 21494 Type *EltTy = VecTy->getElementType(); 21495 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); 21496 21497 const DataLayout &DL = SI->getModule()->getDataLayout(); 21498 Align Alignment = SI->getAlign(); 21499 21500 // Skip if we do not have NEON and skip illegal vector types. We can 21501 // "legalize" wide vector types into multiple interleaved accesses as long as 21502 // the vector types are divisible by 128. 21503 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL)) 21504 return false; 21505 21506 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); 21507 21508 Value *Op0 = SVI->getOperand(0); 21509 Value *Op1 = SVI->getOperand(1); 21510 IRBuilder<> Builder(SI); 21511 21512 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 21513 // vectors to integer vectors. 21514 if (EltTy->isPointerTy()) { 21515 Type *IntTy = DL.getIntPtrType(EltTy); 21516 21517 // Convert to the corresponding integer vector. 21518 auto *IntVecTy = 21519 FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType())); 21520 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 21521 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 21522 21523 SubVecTy = FixedVectorType::get(IntTy, LaneLen); 21524 } 21525 21526 // The base address of the store. 21527 Value *BaseAddr = SI->getPointerOperand(); 21528 21529 if (NumStores > 1) { 21530 // If we're going to generate more than one store, reset the lane length 21531 // and sub-vector type to something legal. 21532 LaneLen /= NumStores; 21533 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); 21534 21535 // We will compute the pointer operand of each store from the original base 21536 // address using GEPs. Cast the base address to a pointer to the scalar 21537 // element type. 
21538 BaseAddr = Builder.CreateBitCast( 21539 BaseAddr, 21540 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); 21541 } 21542 21543 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!"); 21544 21545 auto Mask = SVI->getShuffleMask(); 21546 21547 auto createStoreIntrinsic = [&](Value *BaseAddr, 21548 SmallVectorImpl<Value *> &Shuffles) { 21549 if (Subtarget->hasNEON()) { 21550 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, 21551 Intrinsic::arm_neon_vst3, 21552 Intrinsic::arm_neon_vst4}; 21553 Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); 21554 Type *Tys[] = {Int8Ptr, SubVecTy}; 21555 21556 Function *VstNFunc = Intrinsic::getDeclaration( 21557 SI->getModule(), StoreInts[Factor - 2], Tys); 21558 21559 SmallVector<Value *, 6> Ops; 21560 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); 21561 append_range(Ops, Shuffles); 21562 Ops.push_back(Builder.getInt32(SI->getAlign().value())); 21563 Builder.CreateCall(VstNFunc, Ops); 21564 } else { 21565 assert((Factor == 2 || Factor == 4) && 21566 "expected interleave factor of 2 or 4 for MVE"); 21567 Intrinsic::ID StoreInts = 21568 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q; 21569 Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo( 21570 SI->getPointerAddressSpace()); 21571 Type *Tys[] = {EltPtrTy, SubVecTy}; 21572 Function *VstNFunc = 21573 Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys); 21574 21575 SmallVector<Value *, 6> Ops; 21576 Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy)); 21577 append_range(Ops, Shuffles); 21578 for (unsigned F = 0; F < Factor; F++) { 21579 Ops.push_back(Builder.getInt32(F)); 21580 Builder.CreateCall(VstNFunc, Ops); 21581 Ops.pop_back(); 21582 } 21583 } 21584 }; 21585 21586 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { 21587 // If we're generating more than one store, we compute the base address of 21588 // subsequent stores as an offset from the previous. 21589 if (StoreCount > 0) 21590 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), 21591 BaseAddr, LaneLen * Factor); 21592 21593 SmallVector<Value *, 4> Shuffles; 21594 21595 // Split the shufflevector operands into sub vectors for the new vstN call. 21596 for (unsigned i = 0; i < Factor; i++) { 21597 unsigned IdxI = StoreCount * LaneLen * Factor + i; 21598 if (Mask[IdxI] >= 0) { 21599 Shuffles.push_back(Builder.CreateShuffleVector( 21600 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0))); 21601 } else { 21602 unsigned StartMask = 0; 21603 for (unsigned j = 1; j < LaneLen; j++) { 21604 unsigned IdxJ = StoreCount * LaneLen * Factor + j; 21605 if (Mask[IdxJ * Factor + IdxI] >= 0) { 21606 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; 21607 break; 21608 } 21609 } 21610 // Note: If all elements in a chunk are undefs, StartMask=0! 21611 // Note: Filling undef gaps with random elements is ok, since 21612 // those elements were being written anyway (with undefs).
21613 // In the case of all undefs we're defaulting to using elems from 0 21614 // Note: StartMask cannot be negative, it's checked in 21615 // isReInterleaveMask 21616 Shuffles.push_back(Builder.CreateShuffleVector( 21617 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0))); 21618 } 21619 } 21620 21621 createStoreIntrinsic(BaseAddr, Shuffles); 21622 } 21623 return true; 21624 } 21625 21626 enum HABaseType { 21627 HA_UNKNOWN = 0, 21628 HA_FLOAT, 21629 HA_DOUBLE, 21630 HA_VECT64, 21631 HA_VECT128 21632 }; 21633 21634 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, 21635 uint64_t &Members) { 21636 if (auto *ST = dyn_cast<StructType>(Ty)) { 21637 for (unsigned i = 0; i < ST->getNumElements(); ++i) { 21638 uint64_t SubMembers = 0; 21639 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) 21640 return false; 21641 Members += SubMembers; 21642 } 21643 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { 21644 uint64_t SubMembers = 0; 21645 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) 21646 return false; 21647 Members += SubMembers * AT->getNumElements(); 21648 } else if (Ty->isFloatTy()) { 21649 if (Base != HA_UNKNOWN && Base != HA_FLOAT) 21650 return false; 21651 Members = 1; 21652 Base = HA_FLOAT; 21653 } else if (Ty->isDoubleTy()) { 21654 if (Base != HA_UNKNOWN && Base != HA_DOUBLE) 21655 return false; 21656 Members = 1; 21657 Base = HA_DOUBLE; 21658 } else if (auto *VT = dyn_cast<VectorType>(Ty)) { 21659 Members = 1; 21660 switch (Base) { 21661 case HA_FLOAT: 21662 case HA_DOUBLE: 21663 return false; 21664 case HA_VECT64: 21665 return VT->getPrimitiveSizeInBits().getFixedSize() == 64; 21666 case HA_VECT128: 21667 return VT->getPrimitiveSizeInBits().getFixedSize() == 128; 21668 case HA_UNKNOWN: 21669 switch (VT->getPrimitiveSizeInBits().getFixedSize()) { 21670 case 64: 21671 Base = HA_VECT64; 21672 return true; 21673 case 128: 21674 Base = HA_VECT128; 21675 return true; 21676 default: 21677 return false; 21678 } 21679 } 21680 } 21681 21682 return (Members > 0 && Members <= 4); 21683 } 21684 21685 /// Return the correct alignment for the current calling convention. 21686 Align ARMTargetLowering::getABIAlignmentForCallingConv( 21687 Type *ArgTy, const DataLayout &DL) const { 21688 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy); 21689 if (!ArgTy->isVectorTy()) 21690 return ABITypeAlign; 21691 21692 // Avoid over-aligning vector parameters. It would require realigning the 21693 // stack and waste space for no real benefit. 21694 return std::min(ABITypeAlign, DL.getStackAlignment()); 21695 } 21696 21697 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of 21698 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when 21699 /// passing according to AAPCS rules. 
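/// For example, under the AAPCS-VFP calling convention a homogeneous aggregate
/// of one to four floats, doubles, or 64-/128-bit vectors (such as a C struct
/// of three floats) qualifies, as does an [N x i32] or [N x i64] array.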
21700 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( 21701 Type *Ty, CallingConv::ID CallConv, bool isVarArg, 21702 const DataLayout &DL) const { 21703 if (getEffectiveCallingConv(CallConv, isVarArg) != 21704 CallingConv::ARM_AAPCS_VFP) 21705 return false; 21706 21707 HABaseType Base = HA_UNKNOWN; 21708 uint64_t Members = 0; 21709 bool IsHA = isHomogeneousAggregate(Ty, Base, Members); 21710 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); 21711 21712 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); 21713 return IsHA || IsIntArray; 21714 } 21715 21716 Register ARMTargetLowering::getExceptionPointerRegister( 21717 const Constant *PersonalityFn) const { 21718 // Platforms which do not use SjLj EH may return values in these registers 21719 // via the personality function. 21720 return Subtarget->useSjLjEH() ? Register() : ARM::R0; 21721 } 21722 21723 Register ARMTargetLowering::getExceptionSelectorRegister( 21724 const Constant *PersonalityFn) const { 21725 // Platforms which do not use SjLj EH may return values in these registers 21726 // via the personality function. 21727 return Subtarget->useSjLjEH() ? Register() : ARM::R1; 21728 } 21729 21730 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 21731 // Update IsSplitCSR in ARMFunctionInfo. 21732 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); 21733 AFI->setIsSplitCSR(true); 21734 } 21735 21736 void ARMTargetLowering::insertCopiesSplitCSR( 21737 MachineBasicBlock *Entry, 21738 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 21739 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 21740 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 21741 if (!IStart) 21742 return; 21743 21744 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 21745 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 21746 MachineBasicBlock::iterator MBBI = Entry->begin(); 21747 for (const MCPhysReg *I = IStart; *I; ++I) { 21748 const TargetRegisterClass *RC = nullptr; 21749 if (ARM::GPRRegClass.contains(*I)) 21750 RC = &ARM::GPRRegClass; 21751 else if (ARM::DPRRegClass.contains(*I)) 21752 RC = &ARM::DPRRegClass; 21753 else 21754 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 21755 21756 Register NewVR = MRI->createVirtualRegister(RC); 21757 // Create copy from CSR to a virtual register. 21758 // FIXME: this currently does not emit CFI pseudo-instructions, it works 21759 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 21760 // nounwind. If we want to generalize this later, we may need to emit 21761 // CFI pseudo-instructions. 21762 assert(Entry->getParent()->getFunction().hasFnAttribute( 21763 Attribute::NoUnwind) && 21764 "Function should be nounwind in insertCopiesSplitCSR!"); 21765 Entry->addLiveIn(*I); 21766 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 21767 .addReg(*I); 21768 21769 // Insert the copy-back instructions right before the terminator. 21770 for (auto *Exit : Exits) 21771 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 21772 TII->get(TargetOpcode::COPY), *I) 21773 .addReg(NewVR); 21774 } 21775 } 21776 21777 void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const { 21778 MF.getFrameInfo().computeMaxCallFrameSize(MF); 21779 TargetLoweringBase::finalizeLowering(MF); 21780 } 21781
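
// The following standalone sketch illustrates ARM::isBitFieldInvertedMask from
// earlier in this file: the mask must have a single contiguous run of zero
// bits in the middle (the bit-field) with ones on either or both sides. The
// helpers are local stand-ins for llvm::isShiftedMask_32, used here only so
// the example is self-contained; they are not the MathExtras implementation.
#include <cstdint>

namespace bitfield_example {
static bool isMask32(uint32_t V) { return V != 0 && ((V + 1) & V) == 0; }
static bool isShiftedMask32(uint32_t V) {
  // A shifted mask is a single contiguous run of ones, e.g. 0x00ffff00.
  return V != 0 && isMask32((V - 1) | V);
}
static bool isBitFieldInvertedMask(uint32_t V) {
  return V != 0xffffffffu && isShiftedMask32(~V);
}
} // namespace bitfield_example

// Worked values:
//   0xff0000ff -> ~V = 0x00ffff00 (one run of ones)  -> true
//   0xfffffff0 -> ~V = 0x0000000f (one run of ones)  -> true
//   0xff00ff00 -> ~V = 0x00ff00ff (two runs of ones) -> false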
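
// Summary of the atomic expansion decisions made above by
// shouldExpandAtomicLoadInIR/StoreInIR/RMWInIR/CmpXchgInIR:
//   * M-class: ldrex/strex-based atomicrmw and cmpxchg from Armv8-M Baseline,
//     up to 32 bits; no ldrexd/strexd, so 64-bit atomics fall back to the
//     generic libcall path.
//   * Thumb:   ldrex/strex (and ldrexd/strexd for 64 bits) from Armv7.
//   * ARM:     ldrex/strex (and ldrexd/strexd for 64 bits) from Armv6.
// At -O0, atomicrmw is expanded to a cmpxchg loop and cmpxchg is handled by a
// late-expanded pseudo-instruction, so that spills landing next to the target
// address on the stack cannot repeatedly clear the exclusive monitor.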
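
// Standalone sketch of the i64 marshalling done by emitLoadLinked and
// emitStoreConditional above: ldrexd/ldaexd produce an {i32, i32} pair and
// strexd/stlexd consume one, so the emitted IR splits and recombines the
// halves explicitly (swapping them on big-endian subtargets). The helper
// names below are illustrative only, not LLVM APIs.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitForStrexd(uint64_t Val) {
  uint32_t Lo = static_cast<uint32_t>(Val);        // trunc i64 -> i32
  uint32_t Hi = static_cast<uint32_t>(Val >> 32);  // trunc (lshr i64, 32) -> i32
  return {Lo, Hi};
}

static uint64_t joinFromLdrexd(uint32_t Lo, uint32_t Hi) {
  // or (zext i32 Lo to i64), (shl (zext i32 Hi to i64), 32)
  return static_cast<uint64_t>(Lo) | (static_cast<uint64_t>(Hi) << 32);
}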
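
// Arithmetic sketch for getNumInterleavedAccesses and
// isLegalInterleavedAccessType above: a legal interleaved access type is
// 64 bits (NEON only) or a multiple of 128 bits, and wider types are split
// into ceil(bits / 128) vldN/vstN calls. For instance, de-interleaving two
// <8 x i32> shuffle results (256 bits each) is lowered to two vld2 calls on
// <4 x i32> sub-vectors. The helper below is hypothetical and simply mirrors
// the computation in getNumInterleavedAccesses:
static unsigned exampleNumInterleavedAccesses(unsigned VecSizeInBits) {
  return (VecSizeInBits + 127) / 128; // 64 -> 1, 128 -> 1, 256 -> 2
}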