1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Custom DAG lowering for R600 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "R600ISelLowering.h" 16 #include "AMDGPUFrameLowering.h" 17 #include "AMDGPUIntrinsicInfo.h" 18 #include "AMDGPUSubtarget.h" 19 #include "R600Defines.h" 20 #include "R600InstrInfo.h" 21 #include "R600MachineFunctionInfo.h" 22 #include "llvm/Analysis/ValueTracking.h" 23 #include "llvm/CodeGen/CallingConvLower.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/MachineInstrBuilder.h" 26 #include "llvm/CodeGen/MachineRegisterInfo.h" 27 #include "llvm/CodeGen/SelectionDAG.h" 28 #include "llvm/IR/Argument.h" 29 #include "llvm/IR/Function.h" 30 31 using namespace llvm; 32 33 R600TargetLowering::R600TargetLowering(TargetMachine &TM, 34 const AMDGPUSubtarget &STI) 35 : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { 36 addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); 37 addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); 38 addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); 39 addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); 40 addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); 41 addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); 42 43 computeRegisterProperties(STI.getRegisterInfo()); 44 45 // Set condition code actions 46 setCondCodeAction(ISD::SETO, MVT::f32, Expand); 47 setCondCodeAction(ISD::SETUO, MVT::f32, Expand); 48 setCondCodeAction(ISD::SETLT, MVT::f32, Expand); 49 setCondCodeAction(ISD::SETLE, MVT::f32, Expand); 50 setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); 51 
setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); 52 setCondCodeAction(ISD::SETONE, MVT::f32, Expand); 53 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); 54 setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); 55 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); 56 setCondCodeAction(ISD::SETULT, MVT::f32, Expand); 57 setCondCodeAction(ISD::SETULE, MVT::f32, Expand); 58 59 setCondCodeAction(ISD::SETLE, MVT::i32, Expand); 60 setCondCodeAction(ISD::SETLT, MVT::i32, Expand); 61 setCondCodeAction(ISD::SETULE, MVT::i32, Expand); 62 setCondCodeAction(ISD::SETULT, MVT::i32, Expand); 63 64 setOperationAction(ISD::FCOS, MVT::f32, Custom); 65 setOperationAction(ISD::FSIN, MVT::f32, Custom); 66 67 setOperationAction(ISD::SETCC, MVT::v4i32, Expand); 68 setOperationAction(ISD::SETCC, MVT::v2i32, Expand); 69 70 setOperationAction(ISD::BR_CC, MVT::i32, Expand); 71 setOperationAction(ISD::BR_CC, MVT::f32, Expand); 72 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 73 74 setOperationAction(ISD::FSUB, MVT::f32, Expand); 75 76 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 77 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 78 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); 79 80 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 81 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 82 83 setOperationAction(ISD::SETCC, MVT::i32, Expand); 84 setOperationAction(ISD::SETCC, MVT::f32, Expand); 85 setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); 86 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 87 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 88 89 setOperationAction(ISD::SELECT, MVT::i32, Expand); 90 setOperationAction(ISD::SELECT, MVT::f32, Expand); 91 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 92 setOperationAction(ISD::SELECT, MVT::v4i32, Expand); 93 94 // ADD, SUB overflow. 95 // TODO: turn these into Legal? 
96 if (Subtarget->hasCARRY()) 97 setOperationAction(ISD::UADDO, MVT::i32, Custom); 98 99 if (Subtarget->hasBORROW()) 100 setOperationAction(ISD::USUBO, MVT::i32, Custom); 101 102 // Expand sign extension of vectors 103 if (!Subtarget->hasBFE()) 104 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 105 106 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); 107 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); 108 109 if (!Subtarget->hasBFE()) 110 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 111 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); 112 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); 113 114 if (!Subtarget->hasBFE()) 115 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 116 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); 117 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); 118 119 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 120 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); 121 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); 122 123 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); 124 125 126 // Legalize loads and stores to the private address space. 127 setOperationAction(ISD::LOAD, MVT::i32, Custom); 128 setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 129 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 130 131 // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address 132 // spaces, so it is custom lowered to handle those where it isn't. 
133 for (MVT VT : MVT::integer_valuetypes()) { 134 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 135 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); 136 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); 137 138 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 139 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); 140 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); 141 142 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 143 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); 144 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); 145 } 146 147 setOperationAction(ISD::STORE, MVT::i8, Custom); 148 setOperationAction(ISD::STORE, MVT::i32, Custom); 149 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 150 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 151 setTruncStoreAction(MVT::i32, MVT::i8, Custom); 152 setTruncStoreAction(MVT::i32, MVT::i16, Custom); 153 154 setOperationAction(ISD::LOAD, MVT::i32, Custom); 155 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 156 setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 157 158 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); 159 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); 160 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 161 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 162 163 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); 164 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); 165 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 166 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 167 168 setTargetDAGCombine(ISD::FP_ROUND); 169 setTargetDAGCombine(ISD::FP_TO_SINT); 170 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 171 setTargetDAGCombine(ISD::SELECT_CC); 172 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 173 174 // We don't have 64-bit shifts. 
Thus we need either SHX i64 or SHX_PARTS i32 175 // to be Legal/Custom in order to avoid library calls. 176 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 177 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 178 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 179 180 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 181 182 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 183 for (MVT VT : ScalarIntVTs) { 184 setOperationAction(ISD::ADDC, VT, Expand); 185 setOperationAction(ISD::SUBC, VT, Expand); 186 setOperationAction(ISD::ADDE, VT, Expand); 187 setOperationAction(ISD::SUBE, VT, Expand); 188 } 189 190 setSchedulingPreference(Sched::Source); 191 } 192 193 static inline bool isEOP(MachineBasicBlock::iterator I) { 194 return std::next(I)->getOpcode() == AMDGPU::RETURN; 195 } 196 197 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( 198 MachineInstr * MI, MachineBasicBlock * BB) const { 199 MachineFunction * MF = BB->getParent(); 200 MachineRegisterInfo &MRI = MF->getRegInfo(); 201 MachineBasicBlock::iterator I = *MI; 202 const R600InstrInfo *TII = 203 static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); 204 205 switch (MI->getOpcode()) { 206 default: 207 // Replace LDS_*_RET instruction that don't have any uses with the 208 // equivalent LDS_*_NORET instruction. 209 if (TII->isLDSRetInstr(MI->getOpcode())) { 210 int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); 211 assert(DstIdx != -1); 212 MachineInstrBuilder NewMI; 213 // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add 214 // LDS_1A2D support and remove this special case. 
215 if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || 216 MI->getOpcode() == AMDGPU::LDS_CMPST_RET) 217 return BB; 218 219 NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), 220 TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); 221 for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { 222 NewMI.addOperand(MI->getOperand(i)); 223 } 224 } else { 225 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 226 } 227 break; 228 case AMDGPU::CLAMP_R600: { 229 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 230 AMDGPU::MOV, 231 MI->getOperand(0).getReg(), 232 MI->getOperand(1).getReg()); 233 TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); 234 break; 235 } 236 237 case AMDGPU::FABS_R600: { 238 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 239 AMDGPU::MOV, 240 MI->getOperand(0).getReg(), 241 MI->getOperand(1).getReg()); 242 TII->addFlag(NewMI, 0, MO_FLAG_ABS); 243 break; 244 } 245 246 case AMDGPU::FNEG_R600: { 247 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 248 AMDGPU::MOV, 249 MI->getOperand(0).getReg(), 250 MI->getOperand(1).getReg()); 251 TII->addFlag(NewMI, 0, MO_FLAG_NEG); 252 break; 253 } 254 255 case AMDGPU::MASK_WRITE: { 256 unsigned maskedRegister = MI->getOperand(0).getReg(); 257 assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); 258 MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); 259 TII->addFlag(defInstr, 0, MO_FLAG_MASK); 260 break; 261 } 262 263 case AMDGPU::MOV_IMM_F32: 264 TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), 265 MI->getOperand(1).getFPImm()->getValueAPF() 266 .bitcastToAPInt().getZExtValue()); 267 break; 268 case AMDGPU::MOV_IMM_I32: 269 TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), 270 MI->getOperand(1).getImm()); 271 break; 272 case AMDGPU::CONST_COPY: { 273 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, 274 MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); 275 TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, 276 
MI->getOperand(1).getImm()); 277 break; 278 } 279 280 case AMDGPU::RAT_WRITE_CACHELESS_32_eg: 281 case AMDGPU::RAT_WRITE_CACHELESS_64_eg: 282 case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { 283 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) 284 .addOperand(MI->getOperand(0)) 285 .addOperand(MI->getOperand(1)) 286 .addImm(isEOP(I)); // Set End of program bit 287 break; 288 } 289 case AMDGPU::RAT_STORE_TYPED_eg: { 290 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) 291 .addOperand(MI->getOperand(0)) 292 .addOperand(MI->getOperand(1)) 293 .addOperand(MI->getOperand(2)) 294 .addImm(isEOP(I)); // Set End of program bit 295 break; 296 } 297 298 case AMDGPU::TXD: { 299 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 300 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 301 MachineOperand &RID = MI->getOperand(4); 302 MachineOperand &SID = MI->getOperand(5); 303 unsigned TextureId = MI->getOperand(6).getImm(); 304 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; 305 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; 306 307 switch (TextureId) { 308 case 5: // Rect 309 CTX = CTY = 0; 310 break; 311 case 6: // Shadow1D 312 SrcW = SrcZ; 313 break; 314 case 7: // Shadow2D 315 SrcW = SrcZ; 316 break; 317 case 8: // ShadowRect 318 CTX = CTY = 0; 319 SrcW = SrcZ; 320 break; 321 case 9: // 1DArray 322 SrcZ = SrcY; 323 CTZ = 0; 324 break; 325 case 10: // 2DArray 326 CTZ = 0; 327 break; 328 case 11: // Shadow1DArray 329 SrcZ = SrcY; 330 CTZ = 0; 331 break; 332 case 12: // Shadow2DArray 333 CTZ = 0; 334 break; 335 } 336 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) 337 .addOperand(MI->getOperand(3)) 338 .addImm(SrcX) 339 .addImm(SrcY) 340 .addImm(SrcZ) 341 .addImm(SrcW) 342 .addImm(0) 343 .addImm(0) 344 .addImm(0) 345 .addImm(0) 346 .addImm(1) 347 .addImm(2) 348 .addImm(3) 349 .addOperand(RID) 350 .addOperand(SID) 351 .addImm(CTX) 352 .addImm(CTY) 353 .addImm(CTZ) 354 .addImm(CTW); 355 
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 356 .addOperand(MI->getOperand(2)) 357 .addImm(SrcX) 358 .addImm(SrcY) 359 .addImm(SrcZ) 360 .addImm(SrcW) 361 .addImm(0) 362 .addImm(0) 363 .addImm(0) 364 .addImm(0) 365 .addImm(1) 366 .addImm(2) 367 .addImm(3) 368 .addOperand(RID) 369 .addOperand(SID) 370 .addImm(CTX) 371 .addImm(CTY) 372 .addImm(CTZ) 373 .addImm(CTW); 374 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) 375 .addOperand(MI->getOperand(0)) 376 .addOperand(MI->getOperand(1)) 377 .addImm(SrcX) 378 .addImm(SrcY) 379 .addImm(SrcZ) 380 .addImm(SrcW) 381 .addImm(0) 382 .addImm(0) 383 .addImm(0) 384 .addImm(0) 385 .addImm(1) 386 .addImm(2) 387 .addImm(3) 388 .addOperand(RID) 389 .addOperand(SID) 390 .addImm(CTX) 391 .addImm(CTY) 392 .addImm(CTZ) 393 .addImm(CTW) 394 .addReg(T0, RegState::Implicit) 395 .addReg(T1, RegState::Implicit); 396 break; 397 } 398 399 case AMDGPU::TXD_SHADOW: { 400 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 401 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 402 MachineOperand &RID = MI->getOperand(4); 403 MachineOperand &SID = MI->getOperand(5); 404 unsigned TextureId = MI->getOperand(6).getImm(); 405 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; 406 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; 407 408 switch (TextureId) { 409 case 5: // Rect 410 CTX = CTY = 0; 411 break; 412 case 6: // Shadow1D 413 SrcW = SrcZ; 414 break; 415 case 7: // Shadow2D 416 SrcW = SrcZ; 417 break; 418 case 8: // ShadowRect 419 CTX = CTY = 0; 420 SrcW = SrcZ; 421 break; 422 case 9: // 1DArray 423 SrcZ = SrcY; 424 CTZ = 0; 425 break; 426 case 10: // 2DArray 427 CTZ = 0; 428 break; 429 case 11: // Shadow1DArray 430 SrcZ = SrcY; 431 CTZ = 0; 432 break; 433 case 12: // Shadow2DArray 434 CTZ = 0; 435 break; 436 } 437 438 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) 439 .addOperand(MI->getOperand(3)) 440 .addImm(SrcX) 441 
.addImm(SrcY) 442 .addImm(SrcZ) 443 .addImm(SrcW) 444 .addImm(0) 445 .addImm(0) 446 .addImm(0) 447 .addImm(0) 448 .addImm(1) 449 .addImm(2) 450 .addImm(3) 451 .addOperand(RID) 452 .addOperand(SID) 453 .addImm(CTX) 454 .addImm(CTY) 455 .addImm(CTZ) 456 .addImm(CTW); 457 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 458 .addOperand(MI->getOperand(2)) 459 .addImm(SrcX) 460 .addImm(SrcY) 461 .addImm(SrcZ) 462 .addImm(SrcW) 463 .addImm(0) 464 .addImm(0) 465 .addImm(0) 466 .addImm(0) 467 .addImm(1) 468 .addImm(2) 469 .addImm(3) 470 .addOperand(RID) 471 .addOperand(SID) 472 .addImm(CTX) 473 .addImm(CTY) 474 .addImm(CTZ) 475 .addImm(CTW); 476 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) 477 .addOperand(MI->getOperand(0)) 478 .addOperand(MI->getOperand(1)) 479 .addImm(SrcX) 480 .addImm(SrcY) 481 .addImm(SrcZ) 482 .addImm(SrcW) 483 .addImm(0) 484 .addImm(0) 485 .addImm(0) 486 .addImm(0) 487 .addImm(1) 488 .addImm(2) 489 .addImm(3) 490 .addOperand(RID) 491 .addOperand(SID) 492 .addImm(CTX) 493 .addImm(CTY) 494 .addImm(CTZ) 495 .addImm(CTW) 496 .addReg(T0, RegState::Implicit) 497 .addReg(T1, RegState::Implicit); 498 break; 499 } 500 501 case AMDGPU::BRANCH: 502 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) 503 .addOperand(MI->getOperand(0)); 504 break; 505 506 case AMDGPU::BRANCH_COND_f32: { 507 MachineInstr *NewMI = 508 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), 509 AMDGPU::PREDICATE_BIT) 510 .addOperand(MI->getOperand(1)) 511 .addImm(OPCODE_IS_NOT_ZERO) 512 .addImm(0); // Flags 513 TII->addFlag(NewMI, 0, MO_FLAG_PUSH); 514 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) 515 .addOperand(MI->getOperand(0)) 516 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); 517 break; 518 } 519 520 case AMDGPU::BRANCH_COND_i32: { 521 MachineInstr *NewMI = 522 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), 523 AMDGPU::PREDICATE_BIT) 524 .addOperand(MI->getOperand(1)) 
525 .addImm(OPCODE_IS_NOT_ZERO_INT) 526 .addImm(0); // Flags 527 TII->addFlag(NewMI, 0, MO_FLAG_PUSH); 528 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) 529 .addOperand(MI->getOperand(0)) 530 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); 531 break; 532 } 533 534 case AMDGPU::EG_ExportSwz: 535 case AMDGPU::R600_ExportSwz: { 536 // Instruction is left unmodified if its not the last one of its type 537 bool isLastInstructionOfItsType = true; 538 unsigned InstExportType = MI->getOperand(1).getImm(); 539 for (MachineBasicBlock::iterator NextExportInst = std::next(I), 540 EndBlock = BB->end(); NextExportInst != EndBlock; 541 NextExportInst = std::next(NextExportInst)) { 542 if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || 543 NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { 544 unsigned CurrentInstExportType = NextExportInst->getOperand(1) 545 .getImm(); 546 if (CurrentInstExportType == InstExportType) { 547 isLastInstructionOfItsType = false; 548 break; 549 } 550 } 551 } 552 bool EOP = isEOP(I); 553 if (!EOP && !isLastInstructionOfItsType) 554 return BB; 555 unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; 556 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) 557 .addOperand(MI->getOperand(0)) 558 .addOperand(MI->getOperand(1)) 559 .addOperand(MI->getOperand(2)) 560 .addOperand(MI->getOperand(3)) 561 .addOperand(MI->getOperand(4)) 562 .addOperand(MI->getOperand(5)) 563 .addOperand(MI->getOperand(6)) 564 .addImm(CfInst) 565 .addImm(EOP); 566 break; 567 } 568 case AMDGPU::RETURN: { 569 // RETURN instructions must have the live-out registers as implicit uses, 570 // otherwise they appear dead. 
571 R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); 572 MachineInstrBuilder MIB(*MF, MI); 573 for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) 574 MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); 575 return BB; 576 } 577 } 578 579 MI->eraseFromParent(); 580 return BB; 581 } 582 583 //===----------------------------------------------------------------------===// 584 // Custom DAG Lowering Operations 585 //===----------------------------------------------------------------------===// 586 587 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 588 MachineFunction &MF = DAG.getMachineFunction(); 589 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 590 switch (Op.getOpcode()) { 591 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 592 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 593 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 594 case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); 595 case ISD::SRA_PARTS: 596 case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); 597 case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); 598 case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); 599 case ISD::FCOS: 600 case ISD::FSIN: return LowerTrig(Op, DAG); 601 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 602 case ISD::STORE: return LowerSTORE(Op, DAG); 603 case ISD::LOAD: { 604 SDValue Result = LowerLOAD(Op, DAG); 605 assert((!Result.getNode() || 606 Result.getNode()->getNumValues() == 2) && 607 "Load should return a value and a chain"); 608 return Result; 609 } 610 611 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 612 case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); 613 case ISD::FrameIndex: return lowerFrameIndex(Op, DAG); 614 case ISD::INTRINSIC_VOID: { 615 SDValue Chain = Op.getOperand(0); 616 unsigned IntrinsicID = 617 
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 618 switch (IntrinsicID) { 619 case AMDGPUIntrinsic::R600_store_swizzle: { 620 SDLoc DL(Op); 621 const SDValue Args[8] = { 622 Chain, 623 Op.getOperand(2), // Export Value 624 Op.getOperand(3), // ArrayBase 625 Op.getOperand(4), // Type 626 DAG.getConstant(0, DL, MVT::i32), // SWZ_X 627 DAG.getConstant(1, DL, MVT::i32), // SWZ_Y 628 DAG.getConstant(2, DL, MVT::i32), // SWZ_Z 629 DAG.getConstant(3, DL, MVT::i32) // SWZ_W 630 }; 631 return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); 632 } 633 634 // default for switch(IntrinsicID) 635 default: break; 636 } 637 // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) 638 break; 639 } 640 case ISD::INTRINSIC_WO_CHAIN: { 641 unsigned IntrinsicID = 642 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 643 EVT VT = Op.getValueType(); 644 SDLoc DL(Op); 645 switch(IntrinsicID) { 646 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 647 case AMDGPUIntrinsic::R600_interp_xy: 648 case AMDGPUIntrinsic::R600_interp_zw: { 649 int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 650 MachineSDNode *interp; 651 SDValue RegisterINode = Op.getOperand(2); 652 SDValue RegisterJNode = Op.getOperand(3); 653 654 if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) 655 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, 656 MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), 657 RegisterJNode, RegisterINode); 658 else 659 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, 660 MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), 661 RegisterJNode, RegisterINode); 662 return DAG.getBuildVector(MVT::v2f32, DL, 663 {SDValue(interp, 0), SDValue(interp, 1)}); 664 } 665 case AMDGPUIntrinsic::r600_tex: 666 case AMDGPUIntrinsic::r600_texc: 667 case AMDGPUIntrinsic::r600_txl: 668 case AMDGPUIntrinsic::r600_txlc: 669 case AMDGPUIntrinsic::r600_txb: 670 case AMDGPUIntrinsic::r600_txbc: 671 case 
AMDGPUIntrinsic::r600_txf: 672 case AMDGPUIntrinsic::r600_txq: 673 case AMDGPUIntrinsic::r600_ddx: 674 case AMDGPUIntrinsic::r600_ddy: 675 case AMDGPUIntrinsic::R600_ldptr: { 676 unsigned TextureOp; 677 switch (IntrinsicID) { 678 case AMDGPUIntrinsic::r600_tex: 679 TextureOp = 0; 680 break; 681 case AMDGPUIntrinsic::r600_texc: 682 TextureOp = 1; 683 break; 684 case AMDGPUIntrinsic::r600_txl: 685 TextureOp = 2; 686 break; 687 case AMDGPUIntrinsic::r600_txlc: 688 TextureOp = 3; 689 break; 690 case AMDGPUIntrinsic::r600_txb: 691 TextureOp = 4; 692 break; 693 case AMDGPUIntrinsic::r600_txbc: 694 TextureOp = 5; 695 break; 696 case AMDGPUIntrinsic::r600_txf: 697 TextureOp = 6; 698 break; 699 case AMDGPUIntrinsic::r600_txq: 700 TextureOp = 7; 701 break; 702 case AMDGPUIntrinsic::r600_ddx: 703 TextureOp = 8; 704 break; 705 case AMDGPUIntrinsic::r600_ddy: 706 TextureOp = 9; 707 break; 708 case AMDGPUIntrinsic::R600_ldptr: 709 TextureOp = 10; 710 break; 711 default: 712 llvm_unreachable("Unknow Texture Operation"); 713 } 714 715 SDValue TexArgs[19] = { 716 DAG.getConstant(TextureOp, DL, MVT::i32), 717 Op.getOperand(1), 718 DAG.getConstant(0, DL, MVT::i32), 719 DAG.getConstant(1, DL, MVT::i32), 720 DAG.getConstant(2, DL, MVT::i32), 721 DAG.getConstant(3, DL, MVT::i32), 722 Op.getOperand(2), 723 Op.getOperand(3), 724 Op.getOperand(4), 725 DAG.getConstant(0, DL, MVT::i32), 726 DAG.getConstant(1, DL, MVT::i32), 727 DAG.getConstant(2, DL, MVT::i32), 728 DAG.getConstant(3, DL, MVT::i32), 729 Op.getOperand(5), 730 Op.getOperand(6), 731 Op.getOperand(7), 732 Op.getOperand(8), 733 Op.getOperand(9), 734 Op.getOperand(10) 735 }; 736 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); 737 } 738 case AMDGPUIntrinsic::AMDGPU_dp4: { 739 SDValue Args[8] = { 740 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 741 DAG.getConstant(0, DL, MVT::i32)), 742 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 743 DAG.getConstant(0, DL, 
MVT::i32)), 744 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 745 DAG.getConstant(1, DL, MVT::i32)), 746 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 747 DAG.getConstant(1, DL, MVT::i32)), 748 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 749 DAG.getConstant(2, DL, MVT::i32)), 750 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 751 DAG.getConstant(2, DL, MVT::i32)), 752 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 753 DAG.getConstant(3, DL, MVT::i32)), 754 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 755 DAG.getConstant(3, DL, MVT::i32)) 756 }; 757 return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); 758 } 759 760 case Intrinsic::r600_read_ngroups_x: 761 return LowerImplicitParameter(DAG, VT, DL, 0); 762 case Intrinsic::r600_read_ngroups_y: 763 return LowerImplicitParameter(DAG, VT, DL, 1); 764 case Intrinsic::r600_read_ngroups_z: 765 return LowerImplicitParameter(DAG, VT, DL, 2); 766 case Intrinsic::r600_read_global_size_x: 767 return LowerImplicitParameter(DAG, VT, DL, 3); 768 case Intrinsic::r600_read_global_size_y: 769 return LowerImplicitParameter(DAG, VT, DL, 4); 770 case Intrinsic::r600_read_global_size_z: 771 return LowerImplicitParameter(DAG, VT, DL, 5); 772 case Intrinsic::r600_read_local_size_x: 773 return LowerImplicitParameter(DAG, VT, DL, 6); 774 case Intrinsic::r600_read_local_size_y: 775 return LowerImplicitParameter(DAG, VT, DL, 7); 776 case Intrinsic::r600_read_local_size_z: 777 return LowerImplicitParameter(DAG, VT, DL, 8); 778 779 case Intrinsic::r600_read_workdim: 780 case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name. 
781 uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM); 782 return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4); 783 } 784 785 case Intrinsic::r600_read_tgid_x: 786 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 787 AMDGPU::T1_X, VT); 788 case Intrinsic::r600_read_tgid_y: 789 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 790 AMDGPU::T1_Y, VT); 791 case Intrinsic::r600_read_tgid_z: 792 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 793 AMDGPU::T1_Z, VT); 794 case Intrinsic::r600_read_tidig_x: 795 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 796 AMDGPU::T0_X, VT); 797 case Intrinsic::r600_read_tidig_y: 798 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 799 AMDGPU::T0_Y, VT); 800 case Intrinsic::r600_read_tidig_z: 801 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 802 AMDGPU::T0_Z, VT); 803 804 // FIXME: Should be renamed to r600 prefix 805 case AMDGPUIntrinsic::AMDGPU_rsq_clamped: 806 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 807 808 case Intrinsic::r600_rsq: 809 case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name 810 // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. 811 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 812 } 813 // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) 814 break; 815 } 816 } // end switch(Op.getOpcode()) 817 return SDValue(); 818 } 819 820 void R600TargetLowering::ReplaceNodeResults(SDNode *N, 821 SmallVectorImpl<SDValue> &Results, 822 SelectionDAG &DAG) const { 823 switch (N->getOpcode()) { 824 default: 825 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); 826 return; 827 case ISD::FP_TO_UINT: 828 if (N->getValueType(0) == MVT::i1) { 829 Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); 830 return; 831 } 832 // Fall-through. Since we don't care about out of bounds values 833 // we can use FP_TO_SINT for uints too. 
The DAGLegalizer code for uint 834 // considers some extra cases which are not necessary here. 835 case ISD::FP_TO_SINT: { 836 SDValue Result; 837 if (expandFP_TO_SINT(N, Result, DAG)) 838 Results.push_back(Result); 839 return; 840 } 841 case ISD::SDIVREM: { 842 SDValue Op = SDValue(N, 1); 843 SDValue RES = LowerSDIVREM(Op, DAG); 844 Results.push_back(RES); 845 Results.push_back(RES.getValue(1)); 846 break; 847 } 848 case ISD::UDIVREM: { 849 SDValue Op = SDValue(N, 0); 850 LowerUDIVREM64(Op, DAG, Results); 851 break; 852 } 853 } 854 } 855 856 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, 857 SDValue Vector) const { 858 859 SDLoc DL(Vector); 860 EVT VecVT = Vector.getValueType(); 861 EVT EltVT = VecVT.getVectorElementType(); 862 SmallVector<SDValue, 8> Args; 863 864 for (unsigned i = 0, e = VecVT.getVectorNumElements(); 865 i != e; ++i) { 866 Args.push_back(DAG.getNode( 867 ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, 868 DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout())))); 869 } 870 871 return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); 872 } 873 874 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 875 SelectionDAG &DAG) const { 876 877 SDLoc DL(Op); 878 SDValue Vector = Op.getOperand(0); 879 SDValue Index = Op.getOperand(1); 880 881 if (isa<ConstantSDNode>(Index) || 882 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 883 return Op; 884 885 Vector = vectorToVerticalVector(DAG, Vector); 886 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), 887 Vector, Index); 888 } 889 890 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 891 SelectionDAG &DAG) const { 892 SDLoc DL(Op); 893 SDValue Vector = Op.getOperand(0); 894 SDValue Value = Op.getOperand(1); 895 SDValue Index = Op.getOperand(2); 896 897 if (isa<ConstantSDNode>(Index) || 898 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 899 return Op; 900 901 Vector = vectorToVerticalVector(DAG, Vector); 902 SDValue 
Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), 903 Vector, Value, Index); 904 return vectorToVerticalVector(DAG, Insert); 905 } 906 907 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 908 // On hw >= R700, COS/SIN input must be between -1. and 1. 909 // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) 910 EVT VT = Op.getValueType(); 911 SDValue Arg = Op.getOperand(0); 912 SDLoc DL(Op); 913 914 // TODO: Should this propagate fast-math-flags? 915 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 916 DAG.getNode(ISD::FADD, DL, VT, 917 DAG.getNode(ISD::FMUL, DL, VT, Arg, 918 DAG.getConstantFP(0.15915494309, DL, MVT::f32)), 919 DAG.getConstantFP(0.5, DL, MVT::f32))); 920 unsigned TrigNode; 921 switch (Op.getOpcode()) { 922 case ISD::FCOS: 923 TrigNode = AMDGPUISD::COS_HW; 924 break; 925 case ISD::FSIN: 926 TrigNode = AMDGPUISD::SIN_HW; 927 break; 928 default: 929 llvm_unreachable("Wrong trig opcode"); 930 } 931 SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, 932 DAG.getNode(ISD::FADD, DL, VT, FractPart, 933 DAG.getConstantFP(-0.5, DL, MVT::f32))); 934 if (Gen >= AMDGPUSubtarget::R700) 935 return TrigVal; 936 // On R600 hw, COS/SIN input must be between -Pi and Pi. 
// (tail of R600TargetLowering::LowerTRIG) Rescale the normalized trig
// argument back by multiplying with pi.
  return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
                     DAG.getConstantFP(3.14159265359, DL, MVT::f32));
}

/// Lower ISD::SHL_PARTS: a double-word shift-left expressed on two
/// single-word halves (Lo, Hi) plus a shift amount; returns the two
/// result halves via MERGE_VALUES.
SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);

  SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
  SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the 0 special case.
  // Without it the CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps, the alternative is to
  // add a conditional to filter the special case.

  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);

  // Shift amounts >= the word width move Lo entirely into Hi.
  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
  SDValue LoBig = Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
}

/// Lower ISD::SRL_PARTS / ISD::SRA_PARTS: a double-word right shift
/// (logical or arithmetic, selected by the opcode) on (Lo, Hi) halves.
SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);

  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
  SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the 0 special case.
  // Without it the CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps, the alternative is to
  // add a conditional to filter the special case.

  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  // Shift amounts >= the word width move Hi entirely into Lo; for SRA the
  // new high word is the sign bit replicated (Hi >> (Width-1)).
  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
}

/// Lower unsigned add/sub with overflow: compute the main result with
/// \p mainop and the carry/borrow with \p ovf, then sign-extend the i1
/// overflow bit so callers get the all-ones/zero convention.
SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
                                          unsigned mainop, unsigned ovf) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);

  SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
  // Extend sign.
  OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
                    DAG.getValueType(MVT::i1));

  SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
}

/// Lower f32 -> i1 conversion as a comparison against 0.0: any non-zero
/// float becomes true.
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(
      ISD::SETCC,
      DL,
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

/// Load an implicit kernel parameter from CONSTANT_BUFFER_0 at the given
/// dword offset.
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

/// Returns true if \p Op is an integer or floating-point constant zero.
bool R600TargetLowering::isZero(SDValue Op) const {
  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
    return CstFP->isZero();
  } else {
    return false;
  }
}

/// Hardware "true" is 1.0f for floats and all-ones for integers.
bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->isExactlyValue(1.0);
  }
  return isAllOnesConstant(Op);
}

/// Hardware "false" is 0.0f for floats and zero for integers.
bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->getValueAPF().isZero();
  }
  return isNullConstant(Op);
}

/// Lower SELECT_CC, preferring the native SET* and CND* instruction forms
/// and falling back to a pair of supported SELECT_CC nodes otherwise.
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // f32 selects may fold into the legacy min/max instructions.
  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1, 0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0, f32, f32, cc_supported
  // select_cc i32, 0, i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    // CND* tests against zero with EQ/GT/GE-style conditions; invert the
    // NE-family conditions and swap the select arms to compensate.
    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  }
  else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index.
/// Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  // Shift amount: divide the byte address by the bytes covered per register
  // row at this stack width (width 1 -> 4 bytes, 2 -> 8 bytes, 4 -> 16 bytes).
  unsigned SRLPad;
  switch(StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  SDLoc DL(Ptr);
  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, DL, MVT::i32));
}

/// For element \p ElemIdx of a vector spilled at stack width \p StackWidth,
/// compute which sub-register channel it lives in (\p Channel) and whether
/// the register pointer must be advanced first (\p PtrIncr).
void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    // One element per register: always channel 0, bump the pointer after
    // the first element.
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    // Two elements per register.
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    // Four elements per register: channel == element index.
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

/// Lower an i8/i16 truncating store to private memory as a
/// read-modify-write of the containing 32-bit register:
/// load the dword, mask out the target byte/halfword, OR in the shifted
/// value, and store the dword back.
SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Store);

  unsigned Mask = 0;
  if (Store->getMemoryVT() == MVT::i8) {
    Mask = 0xff;
  } else if (Store->getMemoryVT() == MVT::i16) {
    Mask = 0xffff;
  }

  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  EVT MemVT = Store->getMemoryVT();

  // Dword index of the containing register.
  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
                            DAG.getConstant(2, DL, MVT::i32));
  SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                            Chain, Ptr,
                            DAG.getTargetConstant(0, DL, MVT::i32));

  // Byte offset within the dword, converted to a bit shift (byte * 8).
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
                                DAG.getConstant(0x3, DL, MVT::i32));

  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                  Store->getValue());

  SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);

  SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                     MaskedValue, ShiftAmt);

  // Clear the destination lane: (Mask << shift) inverted, ANDed into Dst.
  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                DAG.getConstant(Mask, DL, MVT::i32),
                                ShiftAmt);
  DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
                        DAG.getConstant(0xffffffff, DL, MVT::i32));
  Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);

  SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
  return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                     Chain, Value, Ptr,
                     DAG.getTargetConstant(0, DL, MVT::i32));
}

/// Custom lowering for ISD::STORE: global truncating stores become
/// STORE_MSKOR, other global stores get dword addressing, and private
/// stores are rewritten to REGISTER_STORE-based indirect addressing.
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
    return Result;

  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  unsigned AS = StoreNode->getAddressSpace();
  SDValue Value = StoreNode->getValue();
  EVT ValueVT = Value.getValueType();

  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
      ValueVT.isVector()) {
    return SplitVectorStore(Op, DAG);
  }

  SDLoc DL(Op);
  SDValue Chain = StoreNode->getChain();
  SDValue Ptr = StoreNode->getBasePtr();

  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, DL, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, DL, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, DL, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               ValueVT.bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
    return SDValue();

  EVT MemVT = StoreNode->getMemoryVT();
  if (MemVT.bitsLT(MVT::i32))
    return lowerPrivateTruncStore(StoreNode, DAG);

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    // Emit one REGISTER_STORE per element, walking the register/channel
    // layout computed by getStackAddress.
    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, DL, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, DL, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, DL, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
                        DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
  }

  return Chain;
}

// Returns 512 + (kc_bank << 12), i.e. the base slot of the constant
// buffer's address block, or -1 for non-constant-buffer address spaces.
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

/// Lower a sub-dword extending load from private memory: load the
/// containing 32-bit register, shift the target byte/halfword down, and
/// sign- or zero-extend it in place according to the load's extension type.
SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();

  // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
  // register (2-)byte extract.

  // Get Register holding the target.
  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
                            DAG.getConstant(2, DL, MVT::i32));
  // Load the Register.
  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
                            Load->getChain(),
                            Ptr,
                            DAG.getTargetConstant(0, DL, MVT::i32),
                            Op.getOperand(2));

  // Get offset within the register.
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                Load->getBasePtr(),
                                DAG.getConstant(0x3, DL, MVT::i32));

  // Bit offset of target byte (byteIdx * 8).
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  // Shift to the right.
  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);

  // Eliminate the upper bits by setting them to ...
  EVT MemEltVT = MemVT.getScalarType();

  // ... ones.
  if (ExtType == ISD::SEXTLOAD) {
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);

    SDValue Ops[] = {
      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
      Load->getChain()
    };

    return DAG.getMergeValues(Ops, DL);
  }

  // ... or zeros.
  SDValue Ops[] = {
    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
    Load->getChain()
  };

  return DAG.getMergeValues(Ops, DL);
}

/// Custom lowering for ISD::LOAD: constant-buffer loads become
/// CONST_ADDRESS nodes, unsupported SEXT loads are expanded, and private
/// loads use REGISTER_LOAD-based indirect addressing.
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  unsigned AS = LoadNode->getAddressSpace();
  EVT MemVT = LoadNode->getMemoryVT();
  ISD::LoadExtType ExtType = LoadNode->getExtensionType();

  if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
      ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
    return lowerPrivateExtLoad(Op, DAG);
  }

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = LoadNode->getChain();
  SDValue Ptr = LoadNode->getBasePtr();

  // Lower loads constant address space global variable loads
  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
      isa<GlobalVariable>(GetUnderlyingObject(
          LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {

    SDValue Ptr = DAG.getZExtOrTrunc(
        LoadNode->getBasePtr(), DL,
        getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                      DAG.getConstant(2, DL, MVT::i32));
    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
                       LoadNode->getChain(), Ptr,
                       DAG.getTargetConstant(0, DL, MVT::i32),
                       Op.getOperand(2));
  }

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      scalarizeVectorLoad(LoadNode, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want Const position encoded with the following formula :
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
        // then div by 4 at the ISel step
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
    } else {
      // non-constant ptr can't be folded, keeps it as a v4f32 load
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                      DAG.getConstant(4, DL, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, DL, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  SDValue LoweredLoad;

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However SEXT loads from other address spaces are not supported, so
  // we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->isInvariant(),
                                     LoadNode->getAlignment());
    SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
                              DAG.getValueType(MemVT));

    SDValue MergedValues[2] = { Res, Chain };
    return DAG.getMergeValues(MergedValues, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, DL, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, DL, MVT::i32),
                             Op.getOperand(2));
    }
    // Pad the remaining lanes up to the full 4-wide register with undef.
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getBuildVector(TargetVT, DL, Loads);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, DL, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2] = {
    LoweredLoad,
    Chain
  };

  return DAG.getMergeValues(Ops, DL);
}

/// Lower ISD::BRCOND to the target's BRANCH_COND node (note the operand
/// order swap: target/jump before condition).
SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Jump = Op.getOperand(2);

  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
                     Chain, Jump, Cond);
}

/// Replace a frame index with its constant byte offset, scaled by the
/// per-row size (4 bytes per sub-register times the stack width).
SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
                                            SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();

  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);

  unsigned FrameIndex = FIN->getIndex();
  unsigned IgnoredFrameReg;
  unsigned Offset =
      TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
                         Op.getValueType());
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
1732 SDValue R600TargetLowering::LowerFormalArguments( 1733 SDValue Chain, 1734 CallingConv::ID CallConv, 1735 bool isVarArg, 1736 const SmallVectorImpl<ISD::InputArg> &Ins, 1737 SDLoc DL, SelectionDAG &DAG, 1738 SmallVectorImpl<SDValue> &InVals) const { 1739 SmallVector<CCValAssign, 16> ArgLocs; 1740 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 1741 *DAG.getContext()); 1742 MachineFunction &MF = DAG.getMachineFunction(); 1743 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 1744 1745 SmallVector<ISD::InputArg, 8> LocalIns; 1746 1747 getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); 1748 1749 AnalyzeFormalArguments(CCInfo, LocalIns); 1750 1751 for (unsigned i = 0, e = Ins.size(); i < e; ++i) { 1752 CCValAssign &VA = ArgLocs[i]; 1753 const ISD::InputArg &In = Ins[i]; 1754 EVT VT = In.VT; 1755 EVT MemVT = VA.getLocVT(); 1756 if (!VT.isVector() && MemVT.isVector()) { 1757 // Get load source type if scalarized. 1758 MemVT = MemVT.getVectorElementType(); 1759 } 1760 1761 if (AMDGPU::isShader(CallConv)) { 1762 unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); 1763 SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); 1764 InVals.push_back(Register); 1765 continue; 1766 } 1767 1768 PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), 1769 AMDGPUAS::CONSTANT_BUFFER_0); 1770 1771 // i64 isn't a legal type, so the register type used ends up as i32, which 1772 // isn't expected here. It attempts to create this sextload, but it ends up 1773 // being invalid. Somehow this seems to work with i64 arguments, but breaks 1774 // for <1 x i64>. 1775 1776 // The first 36 bytes of the input buffer contains information about 1777 // thread group and global sizes. 
1778 ISD::LoadExtType Ext = ISD::NON_EXTLOAD; 1779 if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { 1780 // FIXME: This should really check the extload type, but the handling of 1781 // extload vector parameters seems to be broken. 1782 1783 // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 1784 Ext = ISD::SEXTLOAD; 1785 } 1786 1787 // Compute the offset from the value. 1788 // XXX - I think PartOffset should give you this, but it seems to give the 1789 // size of the register which isn't useful. 1790 1791 unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); 1792 unsigned PartOffset = VA.getLocMemOffset(); 1793 unsigned Offset = 36 + VA.getLocMemOffset(); 1794 1795 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); 1796 SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, 1797 DAG.getConstant(Offset, DL, MVT::i32), 1798 DAG.getUNDEF(MVT::i32), 1799 PtrInfo, 1800 MemVT, false, true, true, 4); 1801 1802 // 4 is the preferred alignment for the CONSTANT memory space. 1803 InVals.push_back(Arg); 1804 MFI->ABIArgOffset = Offset + MemVT.getStoreSize(); 1805 } 1806 return Chain; 1807 } 1808 1809 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1810 EVT VT) const { 1811 if (!VT.isVector()) 1812 return MVT::i32; 1813 return VT.changeVectorElementTypeToInteger(); 1814 } 1815 1816 bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 1817 unsigned AddrSpace, 1818 unsigned Align, 1819 bool *IsFast) const { 1820 if (IsFast) 1821 *IsFast = false; 1822 1823 if (!VT.isSimple() || VT == MVT::Other) 1824 return false; 1825 1826 if (VT.bitsLT(MVT::i32)) 1827 return false; 1828 1829 // TODO: This is a rough estimate. 
1830 if (IsFast) 1831 *IsFast = true; 1832 1833 return VT.bitsGT(MVT::i32) && Align % 4 == 0; 1834 } 1835 1836 static SDValue CompactSwizzlableVector( 1837 SelectionDAG &DAG, SDValue VectorEntry, 1838 DenseMap<unsigned, unsigned> &RemapSwizzle) { 1839 assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); 1840 assert(RemapSwizzle.empty()); 1841 SDValue NewBldVec[4] = { 1842 VectorEntry.getOperand(0), 1843 VectorEntry.getOperand(1), 1844 VectorEntry.getOperand(2), 1845 VectorEntry.getOperand(3) 1846 }; 1847 1848 for (unsigned i = 0; i < 4; i++) { 1849 if (NewBldVec[i].isUndef()) 1850 // We mask write here to teach later passes that the ith element of this 1851 // vector is undef. Thus we can use it to reduce 128 bits reg usage, 1852 // break false dependencies and additionnaly make assembly easier to read. 1853 RemapSwizzle[i] = 7; // SEL_MASK_WRITE 1854 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) { 1855 if (C->isZero()) { 1856 RemapSwizzle[i] = 4; // SEL_0 1857 NewBldVec[i] = DAG.getUNDEF(MVT::f32); 1858 } else if (C->isExactlyValue(1.0)) { 1859 RemapSwizzle[i] = 5; // SEL_1 1860 NewBldVec[i] = DAG.getUNDEF(MVT::f32); 1861 } 1862 } 1863 1864 if (NewBldVec[i].isUndef()) 1865 continue; 1866 for (unsigned j = 0; j < i; j++) { 1867 if (NewBldVec[i] == NewBldVec[j]) { 1868 NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); 1869 RemapSwizzle[i] = j; 1870 break; 1871 } 1872 } 1873 } 1874 1875 return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), 1876 NewBldVec); 1877 } 1878 1879 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, 1880 DenseMap<unsigned, unsigned> &RemapSwizzle) { 1881 assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); 1882 assert(RemapSwizzle.empty()); 1883 SDValue NewBldVec[4] = { 1884 VectorEntry.getOperand(0), 1885 VectorEntry.getOperand(1), 1886 VectorEntry.getOperand(2), 1887 VectorEntry.getOperand(3) 1888 }; 1889 bool isUnmovable[4] = { false, false, false, false }; 1890 for 
(unsigned i = 0; i < 4; i++) { 1891 RemapSwizzle[i] = i; 1892 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 1893 unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1)) 1894 ->getZExtValue(); 1895 if (i == Idx) 1896 isUnmovable[Idx] = true; 1897 } 1898 } 1899 1900 for (unsigned i = 0; i < 4; i++) { 1901 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 1902 unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1)) 1903 ->getZExtValue(); 1904 if (isUnmovable[Idx]) 1905 continue; 1906 // Swap i and Idx 1907 std::swap(NewBldVec[Idx], NewBldVec[i]); 1908 std::swap(RemapSwizzle[i], RemapSwizzle[Idx]); 1909 break; 1910 } 1911 } 1912 1913 return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), 1914 NewBldVec); 1915 } 1916 1917 1918 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, 1919 SDValue Swz[4], SelectionDAG &DAG, 1920 SDLoc DL) const { 1921 assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); 1922 // Old -> New swizzle values 1923 DenseMap<unsigned, unsigned> SwizzleRemap; 1924 1925 BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); 1926 for (unsigned i = 0; i < 4; i++) { 1927 unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); 1928 if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) 1929 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); 1930 } 1931 1932 SwizzleRemap.clear(); 1933 BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); 1934 for (unsigned i = 0; i < 4; i++) { 1935 unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); 1936 if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) 1937 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); 1938 } 1939 1940 return BuildVector; 1941 } 1942 1943 1944 //===----------------------------------------------------------------------===// 1945 // Custom DAG Optimizations 1946 //===----------------------------------------------------------------------===// 1947 1948 SDValue 
R600TargetLowering::PerformDAGCombine(SDNode *N, 1949 DAGCombinerInfo &DCI) const { 1950 SelectionDAG &DAG = DCI.DAG; 1951 1952 switch (N->getOpcode()) { 1953 default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 1954 // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) 1955 case ISD::FP_ROUND: { 1956 SDValue Arg = N->getOperand(0); 1957 if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { 1958 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0), 1959 Arg.getOperand(0)); 1960 } 1961 break; 1962 } 1963 1964 // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> 1965 // (i32 select_cc f32, f32, -1, 0 cc) 1966 // 1967 // Mesa's GLSL frontend generates the above pattern a lot and we can lower 1968 // this to one of the SET*_DX10 instructions. 1969 case ISD::FP_TO_SINT: { 1970 SDValue FNeg = N->getOperand(0); 1971 if (FNeg.getOpcode() != ISD::FNEG) { 1972 return SDValue(); 1973 } 1974 SDValue SelectCC = FNeg.getOperand(0); 1975 if (SelectCC.getOpcode() != ISD::SELECT_CC || 1976 SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS 1977 SelectCC.getOperand(2).getValueType() != MVT::f32 || // True 1978 !isHWTrueValue(SelectCC.getOperand(2)) || 1979 !isHWFalseValue(SelectCC.getOperand(3))) { 1980 return SDValue(); 1981 } 1982 1983 SDLoc dl(N); 1984 return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), 1985 SelectCC.getOperand(0), // LHS 1986 SelectCC.getOperand(1), // RHS 1987 DAG.getConstant(-1, dl, MVT::i32), // True 1988 DAG.getConstant(0, dl, MVT::i32), // False 1989 SelectCC.getOperand(4)); // CC 1990 1991 break; 1992 } 1993 1994 // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx 1995 // => build_vector elt0, ... , NewEltIdx, ... 
, eltN 1996 case ISD::INSERT_VECTOR_ELT: { 1997 SDValue InVec = N->getOperand(0); 1998 SDValue InVal = N->getOperand(1); 1999 SDValue EltNo = N->getOperand(2); 2000 SDLoc dl(N); 2001 2002 // If the inserted element is an UNDEF, just use the input vector. 2003 if (InVal.isUndef()) 2004 return InVec; 2005 2006 EVT VT = InVec.getValueType(); 2007 2008 // If we can't generate a legal BUILD_VECTOR, exit 2009 if (!isOperationLegal(ISD::BUILD_VECTOR, VT)) 2010 return SDValue(); 2011 2012 // Check that we know which element is being inserted 2013 if (!isa<ConstantSDNode>(EltNo)) 2014 return SDValue(); 2015 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 2016 2017 // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially 2018 // be converted to a BUILD_VECTOR). Fill in the Ops vector with the 2019 // vector elements. 2020 SmallVector<SDValue, 8> Ops; 2021 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 2022 Ops.append(InVec.getNode()->op_begin(), 2023 InVec.getNode()->op_end()); 2024 } else if (InVec.isUndef()) { 2025 unsigned NElts = VT.getVectorNumElements(); 2026 Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); 2027 } else { 2028 return SDValue(); 2029 } 2030 2031 // Insert the element 2032 if (Elt < Ops.size()) { 2033 // All the operands of BUILD_VECTOR must have the same type; 2034 // we enforce that here. 2035 EVT OpVT = Ops[0].getValueType(); 2036 if (InVal.getValueType() != OpVT) 2037 InVal = OpVT.bitsGT(InVal.getValueType()) ? 
2038 DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) : 2039 DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal); 2040 Ops[Elt] = InVal; 2041 } 2042 2043 // Return the new vector 2044 return DAG.getBuildVector(VT, dl, Ops); 2045 } 2046 2047 // Extract_vec (Build_vector) generated by custom lowering 2048 // also needs to be customly combined 2049 case ISD::EXTRACT_VECTOR_ELT: { 2050 SDValue Arg = N->getOperand(0); 2051 if (Arg.getOpcode() == ISD::BUILD_VECTOR) { 2052 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 2053 unsigned Element = Const->getZExtValue(); 2054 return Arg->getOperand(Element); 2055 } 2056 } 2057 if (Arg.getOpcode() == ISD::BITCAST && 2058 Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 2059 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 2060 unsigned Element = Const->getZExtValue(); 2061 return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(), 2062 Arg->getOperand(0).getOperand(Element)); 2063 } 2064 } 2065 break; 2066 } 2067 2068 case ISD::SELECT_CC: { 2069 // Try common optimizations 2070 if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI)) 2071 return Ret; 2072 2073 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> 2074 // selectcc x, y, a, b, inv(cc) 2075 // 2076 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> 2077 // selectcc x, y, a, b, cc 2078 SDValue LHS = N->getOperand(0); 2079 if (LHS.getOpcode() != ISD::SELECT_CC) { 2080 return SDValue(); 2081 } 2082 2083 SDValue RHS = N->getOperand(1); 2084 SDValue True = N->getOperand(2); 2085 SDValue False = N->getOperand(3); 2086 ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get(); 2087 2088 if (LHS.getOperand(2).getNode() != True.getNode() || 2089 LHS.getOperand(3).getNode() != False.getNode() || 2090 RHS.getNode() != False.getNode()) { 2091 return SDValue(); 2092 } 2093 2094 switch (NCC) { 2095 default: return SDValue(); 2096 case ISD::SETNE: return LHS; 2097 case ISD::SETEQ: { 2098 
ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get(); 2099 LHSCC = ISD::getSetCCInverse(LHSCC, 2100 LHS.getOperand(0).getValueType().isInteger()); 2101 if (DCI.isBeforeLegalizeOps() || 2102 isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType())) 2103 return DAG.getSelectCC(SDLoc(N), 2104 LHS.getOperand(0), 2105 LHS.getOperand(1), 2106 LHS.getOperand(2), 2107 LHS.getOperand(3), 2108 LHSCC); 2109 break; 2110 } 2111 } 2112 return SDValue(); 2113 } 2114 2115 case AMDGPUISD::EXPORT: { 2116 SDValue Arg = N->getOperand(1); 2117 if (Arg.getOpcode() != ISD::BUILD_VECTOR) 2118 break; 2119 2120 SDValue NewArgs[8] = { 2121 N->getOperand(0), // Chain 2122 SDValue(), 2123 N->getOperand(2), // ArrayBase 2124 N->getOperand(3), // Type 2125 N->getOperand(4), // SWZ_X 2126 N->getOperand(5), // SWZ_Y 2127 N->getOperand(6), // SWZ_Z 2128 N->getOperand(7) // SWZ_W 2129 }; 2130 SDLoc DL(N); 2131 NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); 2132 return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); 2133 } 2134 case AMDGPUISD::TEXTURE_FETCH: { 2135 SDValue Arg = N->getOperand(1); 2136 if (Arg.getOpcode() != ISD::BUILD_VECTOR) 2137 break; 2138 2139 SDValue NewArgs[19] = { 2140 N->getOperand(0), 2141 N->getOperand(1), 2142 N->getOperand(2), 2143 N->getOperand(3), 2144 N->getOperand(4), 2145 N->getOperand(5), 2146 N->getOperand(6), 2147 N->getOperand(7), 2148 N->getOperand(8), 2149 N->getOperand(9), 2150 N->getOperand(10), 2151 N->getOperand(11), 2152 N->getOperand(12), 2153 N->getOperand(13), 2154 N->getOperand(14), 2155 N->getOperand(15), 2156 N->getOperand(16), 2157 N->getOperand(17), 2158 N->getOperand(18), 2159 }; 2160 SDLoc DL(N); 2161 NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); 2162 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); 2163 } 2164 } 2165 2166 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 2167 } 2168 2169 static bool 2170 FoldOperand(SDNode 
*ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, 2171 SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { 2172 const R600InstrInfo *TII = 2173 static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); 2174 if (!Src.isMachineOpcode()) 2175 return false; 2176 switch (Src.getMachineOpcode()) { 2177 case AMDGPU::FNEG_R600: 2178 if (!Neg.getNode()) 2179 return false; 2180 Src = Src.getOperand(0); 2181 Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); 2182 return true; 2183 case AMDGPU::FABS_R600: 2184 if (!Abs.getNode()) 2185 return false; 2186 Src = Src.getOperand(0); 2187 Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); 2188 return true; 2189 case AMDGPU::CONST_COPY: { 2190 unsigned Opcode = ParentNode->getMachineOpcode(); 2191 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; 2192 2193 if (!Sel.getNode()) 2194 return false; 2195 2196 SDValue CstOffset = Src.getOperand(0); 2197 if (ParentNode->getValueType(0).isVector()) 2198 return false; 2199 2200 // Gather constants values 2201 int SrcIndices[] = { 2202 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), 2203 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), 2204 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), 2205 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), 2206 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), 2207 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), 2208 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), 2209 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), 2210 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), 2211 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), 2212 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) 2213 }; 2214 std::vector<unsigned> Consts; 2215 for (int OtherSrcIdx : SrcIndices) { 2216 int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); 2217 if (OtherSrcIdx < 0 || OtherSelIdx < 0) 2218 continue; 2219 if (HasDst) { 2220 OtherSrcIdx--; 2221 OtherSelIdx--; 2222 } 2223 if 
(RegisterSDNode *Reg = 2224 dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) { 2225 if (Reg->getReg() == AMDGPU::ALU_CONST) { 2226 ConstantSDNode *Cst 2227 = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx)); 2228 Consts.push_back(Cst->getZExtValue()); 2229 } 2230 } 2231 } 2232 2233 ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset); 2234 Consts.push_back(Cst->getZExtValue()); 2235 if (!TII->fitsConstReadLimitations(Consts)) { 2236 return false; 2237 } 2238 2239 Sel = CstOffset; 2240 Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); 2241 return true; 2242 } 2243 case AMDGPU::MOV_IMM_I32: 2244 case AMDGPU::MOV_IMM_F32: { 2245 unsigned ImmReg = AMDGPU::ALU_LITERAL_X; 2246 uint64_t ImmValue = 0; 2247 2248 2249 if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { 2250 ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0)); 2251 float FloatValue = FPC->getValueAPF().convertToFloat(); 2252 if (FloatValue == 0.0) { 2253 ImmReg = AMDGPU::ZERO; 2254 } else if (FloatValue == 0.5) { 2255 ImmReg = AMDGPU::HALF; 2256 } else if (FloatValue == 1.0) { 2257 ImmReg = AMDGPU::ONE; 2258 } else { 2259 ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); 2260 } 2261 } else { 2262 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0)); 2263 uint64_t Value = C->getZExtValue(); 2264 if (Value == 0) { 2265 ImmReg = AMDGPU::ZERO; 2266 } else if (Value == 1) { 2267 ImmReg = AMDGPU::ONE_INT; 2268 } else { 2269 ImmValue = Value; 2270 } 2271 } 2272 2273 // Check that we aren't already using an immediate. 2274 // XXX: It's possible for an instruction to have more than one 2275 // immediate operand, but this is not supported yet. 
2276 if (ImmReg == AMDGPU::ALU_LITERAL_X) { 2277 if (!Imm.getNode()) 2278 return false; 2279 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm); 2280 assert(C); 2281 if (C->getZExtValue()) 2282 return false; 2283 Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); 2284 } 2285 Src = DAG.getRegister(ImmReg, MVT::i32); 2286 return true; 2287 } 2288 default: 2289 return false; 2290 } 2291 } 2292 2293 2294 /// \brief Fold the instructions after selecting them 2295 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, 2296 SelectionDAG &DAG) const { 2297 const R600InstrInfo *TII = 2298 static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); 2299 if (!Node->isMachineOpcode()) 2300 return Node; 2301 unsigned Opcode = Node->getMachineOpcode(); 2302 SDValue FakeOp; 2303 2304 std::vector<SDValue> Ops(Node->op_begin(), Node->op_end()); 2305 2306 if (Opcode == AMDGPU::DOT_4) { 2307 int OperandIdx[] = { 2308 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), 2309 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), 2310 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), 2311 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), 2312 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), 2313 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), 2314 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), 2315 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) 2316 }; 2317 int NegIdx[] = { 2318 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), 2319 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), 2320 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), 2321 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W), 2322 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), 2323 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), 2324 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), 2325 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) 2326 }; 2327 int AbsIdx[] = { 2328 TII->getOperandIdx(Opcode, 
AMDGPU::OpName::src0_abs_X), 2329 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), 2330 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), 2331 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), 2332 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), 2333 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), 2334 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), 2335 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) 2336 }; 2337 for (unsigned i = 0; i < 8; i++) { 2338 if (OperandIdx[i] < 0) 2339 return Node; 2340 SDValue &Src = Ops[OperandIdx[i] - 1]; 2341 SDValue &Neg = Ops[NegIdx[i] - 1]; 2342 SDValue &Abs = Ops[AbsIdx[i] - 1]; 2343 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; 2344 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); 2345 if (HasDst) 2346 SelIdx--; 2347 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; 2348 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) 2349 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 2350 } 2351 } else if (Opcode == AMDGPU::REG_SEQUENCE) { 2352 for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { 2353 SDValue &Src = Ops[i]; 2354 if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) 2355 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 2356 } 2357 } else if (Opcode == AMDGPU::CLAMP_R600) { 2358 SDValue Src = Node->getOperand(0); 2359 if (!Src.isMachineOpcode() || 2360 !TII->hasInstrModifiers(Src.getMachineOpcode())) 2361 return Node; 2362 int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), 2363 AMDGPU::OpName::clamp); 2364 if (ClampIdx < 0) 2365 return Node; 2366 SDLoc DL(Node); 2367 std::vector<SDValue> Ops(Src->op_begin(), Src->op_end()); 2368 Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); 2369 return DAG.getMachineNode(Src.getMachineOpcode(), DL, 2370 Node->getVTList(), Ops); 2371 } else { 2372 if (!TII->hasInstrModifiers(Opcode)) 2373 return Node; 2374 int 
OperandIdx[] = { 2375 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), 2376 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), 2377 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) 2378 }; 2379 int NegIdx[] = { 2380 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), 2381 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), 2382 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) 2383 }; 2384 int AbsIdx[] = { 2385 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), 2386 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), 2387 -1 2388 }; 2389 for (unsigned i = 0; i < 3; i++) { 2390 if (OperandIdx[i] < 0) 2391 return Node; 2392 SDValue &Src = Ops[OperandIdx[i] - 1]; 2393 SDValue &Neg = Ops[NegIdx[i] - 1]; 2394 SDValue FakeAbs; 2395 SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; 2396 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; 2397 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); 2398 int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); 2399 if (HasDst) { 2400 SelIdx--; 2401 ImmIdx--; 2402 } 2403 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; 2404 SDValue &Imm = Ops[ImmIdx]; 2405 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) 2406 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 2407 } 2408 } 2409 2410 return Node; 2411 } 2412