1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for R600
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "R600ISelLowering.h"
16 #include "AMDGPUFrameLowering.h"
17 #include "AMDGPUIntrinsicInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "R600Defines.h"
20 #include "R600InstrInfo.h"
21 #include "R600MachineFunctionInfo.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/CodeGen/CallingConvLower.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27 #include "llvm/CodeGen/SelectionDAG.h"
28 #include "llvm/IR/Argument.h"
29 #include "llvm/IR/Function.h"
30 
31 using namespace llvm;
32 
33 R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
34                                        const R600Subtarget &STI)
35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
36   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
37   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
38   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
39   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
40   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
41   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
42 
43   computeRegisterProperties(STI.getRegisterInfo());
44 
45   // Legalize loads and stores to the private address space.
46   setOperationAction(ISD::LOAD, MVT::i32, Custom);
47   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
48   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
49 
50   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
51   // spaces, so it is custom lowered to handle those where it isn't.
52   for (MVT VT : MVT::integer_valuetypes()) {
53     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
54     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
55     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
56 
57     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
58     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
59     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
60 
61     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
62     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
63     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
64   }
65 
66   // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
67   setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
68   setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
69   setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
70 
71   setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
72   setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
73   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
74 
75 
76   setOperationAction(ISD::STORE, MVT::i8, Custom);
77   setOperationAction(ISD::STORE, MVT::i32, Custom);
78   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
79   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
80 
81   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
82   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
83 
84   // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
85   setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
86   setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
87 
88   // Set condition code actions
89   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
90   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
91   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
92   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
93   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
94   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
95   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
96   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
97   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
98   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
99   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
100   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
101 
102   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
103   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
104   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
105   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
106 
107   setOperationAction(ISD::FCOS, MVT::f32, Custom);
108   setOperationAction(ISD::FSIN, MVT::f32, Custom);
109 
110   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
111   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
112 
113   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
114   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
115   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
116 
117   setOperationAction(ISD::FSUB, MVT::f32, Expand);
118 
119   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
120   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
121 
122   setOperationAction(ISD::SETCC, MVT::i32, Expand);
123   setOperationAction(ISD::SETCC, MVT::f32, Expand);
124   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
125   setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
126   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
127   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
128 
129   setOperationAction(ISD::SELECT, MVT::i32, Expand);
130   setOperationAction(ISD::SELECT, MVT::f32, Expand);
131   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
132   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
133 
134   // ADD, SUB overflow.
135   // TODO: turn these into Legal?
136   if (Subtarget->hasCARRY())
137     setOperationAction(ISD::UADDO, MVT::i32, Custom);
138 
139   if (Subtarget->hasBORROW())
140     setOperationAction(ISD::USUBO, MVT::i32, Custom);
141 
142   // Expand sign extension of vectors
143   if (!Subtarget->hasBFE())
144     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
145 
146   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
147   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
148 
149   if (!Subtarget->hasBFE())
150     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
151   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
152   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
153 
154   if (!Subtarget->hasBFE())
155     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
156   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
157   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
158 
159   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
160   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
161   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
162 
163   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
164 
165   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
166 
167   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
168   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
169   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
170   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
171 
172   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
173   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
174   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
175   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
176 
  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
179   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
180   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
181   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
182 
183   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
184 
185   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
186   for (MVT VT : ScalarIntVTs) {
187     setOperationAction(ISD::ADDC, VT, Expand);
188     setOperationAction(ISD::SUBC, VT, Expand);
189     setOperationAction(ISD::ADDE, VT, Expand);
190     setOperationAction(ISD::SUBE, VT, Expand);
191   }
192 
193   setSchedulingPreference(Sched::Source);
194 
195 
196   setTargetDAGCombine(ISD::FP_ROUND);
197   setTargetDAGCombine(ISD::FP_TO_SINT);
198   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
199   setTargetDAGCombine(ISD::SELECT_CC);
200   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
201   setTargetDAGCombine(ISD::LOAD);
202 }
203 
204 const R600Subtarget *R600TargetLowering::getSubtarget() const {
205   return static_cast<const R600Subtarget *>(Subtarget);
206 }
207 
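// Returns true if the instruction immediately following I is a RETURN.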
208 static inline bool isEOP(MachineBasicBlock::iterator I) {
209   if (std::next(I) == I->getParent()->end())
210     return false;
211   return std::next(I)->getOpcode() == AMDGPU::RETURN;
212 }
213 
214 MachineBasicBlock *
215 R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
216                                                 MachineBasicBlock *BB) const {
217   MachineFunction * MF = BB->getParent();
218   MachineRegisterInfo &MRI = MF->getRegInfo();
219   MachineBasicBlock::iterator I = MI;
220   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
221 
222   switch (MI.getOpcode()) {
223   default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
226     if (TII->isLDSRetInstr(MI.getOpcode())) {
227       int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
228       assert(DstIdx != -1);
229       MachineInstrBuilder NewMI;
230       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
231       //        LDS_1A2D support and remove this special case.
232       if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
233           MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
234         return BB;
235 
236       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
237                       TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
238       for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
239         NewMI.addOperand(MI.getOperand(i));
240       }
241     } else {
242       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
243     }
244     break;
245   case AMDGPU::CLAMP_R600: {
246     MachineInstr *NewMI = TII->buildDefaultInstruction(
247         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
248         MI.getOperand(1).getReg());
249     TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
250     break;
251   }
252 
253   case AMDGPU::FABS_R600: {
254     MachineInstr *NewMI = TII->buildDefaultInstruction(
255         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
256         MI.getOperand(1).getReg());
257     TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
258     break;
259   }
260 
261   case AMDGPU::FNEG_R600: {
262     MachineInstr *NewMI = TII->buildDefaultInstruction(
263         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
264         MI.getOperand(1).getReg());
265     TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
266     break;
267   }
268 
269   case AMDGPU::MASK_WRITE: {
270     unsigned maskedRegister = MI.getOperand(0).getReg();
271     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
272     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
273     TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
274     break;
275   }
276 
277   case AMDGPU::MOV_IMM_F32:
278     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
279                                                             .getFPImm()
280                                                             ->getValueAPF()
281                                                             .bitcastToAPInt()
282                                                             .getZExtValue());
283     break;
284   case AMDGPU::MOV_IMM_I32:
285     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
286                      MI.getOperand(1).getImm());
287     break;
288   case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
289     //TODO: Perhaps combine this instruction with the next if possible
290     auto MIB = TII->buildDefaultInstruction(
291         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
292     int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
293     //TODO: Ugh this is rather ugly
294     MIB->getOperand(Idx) = MI.getOperand(1);
295     break;
296   }
297   case AMDGPU::CONST_COPY: {
298     MachineInstr *NewMI = TII->buildDefaultInstruction(
299         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
300     TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
301                        MI.getOperand(1).getImm());
302     break;
303   }
304 
305   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
306   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
307   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
308     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
309         .addOperand(MI.getOperand(0))
310         .addOperand(MI.getOperand(1))
311         .addImm(isEOP(I)); // Set End of program bit
312     break;
313   }
314   case AMDGPU::RAT_STORE_TYPED_eg: {
315     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
316         .addOperand(MI.getOperand(0))
317         .addOperand(MI.getOperand(1))
318         .addOperand(MI.getOperand(2))
319         .addImm(isEOP(I)); // Set End of program bit
320     break;
321   }
322   case AMDGPU::BRANCH:
323     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
324         .addOperand(MI.getOperand(0));
325     break;
326 
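  // Conditional branches are lowered to a PRED_X that sets the predicate bit
  // (PRED_SETNE for f32 conditions, PRED_SETNE_INT for i32 conditions),
  // followed by a JUMP_COND predicated on that bit.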
327   case AMDGPU::BRANCH_COND_f32: {
328     MachineInstr *NewMI =
329         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
330                 AMDGPU::PREDICATE_BIT)
331             .addOperand(MI.getOperand(1))
332             .addImm(AMDGPU::PRED_SETNE)
333             .addImm(0); // Flags
334     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
335     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
336         .addOperand(MI.getOperand(0))
337         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
338     break;
339   }
340 
341   case AMDGPU::BRANCH_COND_i32: {
342     MachineInstr *NewMI =
343         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
344                 AMDGPU::PREDICATE_BIT)
345             .addOperand(MI.getOperand(1))
346             .addImm(AMDGPU::PRED_SETNE_INT)
347             .addImm(0); // Flags
348     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
349     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
350         .addOperand(MI.getOperand(0))
351         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
352     break;
353   }
354 
355   case AMDGPU::EG_ExportSwz:
356   case AMDGPU::R600_ExportSwz: {
    // The instruction is left unmodified if it's not the last one of its type.
358     bool isLastInstructionOfItsType = true;
359     unsigned InstExportType = MI.getOperand(1).getImm();
360     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
361          EndBlock = BB->end(); NextExportInst != EndBlock;
362          NextExportInst = std::next(NextExportInst)) {
363       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
364           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
365         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
366             .getImm();
367         if (CurrentInstExportType == InstExportType) {
368           isLastInstructionOfItsType = false;
369           break;
370         }
371       }
372     }
373     bool EOP = isEOP(I);
374     if (!EOP && !isLastInstructionOfItsType)
375       return BB;
376     unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
377     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
378         .addOperand(MI.getOperand(0))
379         .addOperand(MI.getOperand(1))
380         .addOperand(MI.getOperand(2))
381         .addOperand(MI.getOperand(3))
382         .addOperand(MI.getOperand(4))
383         .addOperand(MI.getOperand(5))
384         .addOperand(MI.getOperand(6))
385         .addImm(CfInst)
386         .addImm(EOP);
387     break;
388   }
389   case AMDGPU::RETURN: {
390     return BB;
391   }
392   }
393 
394   MI.eraseFromParent();
395   return BB;
396 }
397 
398 //===----------------------------------------------------------------------===//
399 // Custom DAG Lowering Operations
400 //===----------------------------------------------------------------------===//
401 
402 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
403   MachineFunction &MF = DAG.getMachineFunction();
404   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
405   switch (Op.getOpcode()) {
406   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
407   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
408   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
409   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
410   case ISD::SRA_PARTS:
411   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
412   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
413   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
414   case ISD::FCOS:
415   case ISD::FSIN: return LowerTrig(Op, DAG);
416   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
417   case ISD::STORE: return LowerSTORE(Op, DAG);
418   case ISD::LOAD: {
419     SDValue Result = LowerLOAD(Op, DAG);
420     assert((!Result.getNode() ||
421             Result.getNode()->getNumValues() == 2) &&
422            "Load should return a value and a chain");
423     return Result;
424   }
425 
426   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
427   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
428   case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
429   case ISD::INTRINSIC_VOID: {
430     SDValue Chain = Op.getOperand(0);
431     unsigned IntrinsicID =
432                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
433     switch (IntrinsicID) {
434     case AMDGPUIntrinsic::r600_store_swizzle: {
435       SDLoc DL(Op);
436       const SDValue Args[8] = {
437         Chain,
438         Op.getOperand(2), // Export Value
439         Op.getOperand(3), // ArrayBase
440         Op.getOperand(4), // Type
441         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
442         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
443         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
444         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
445       };
446       return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
447     }
448 
449     // default for switch(IntrinsicID)
450     default: break;
451     }
452     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
453     break;
454   }
455   case ISD::INTRINSIC_WO_CHAIN: {
456     unsigned IntrinsicID =
457                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
458     EVT VT = Op.getValueType();
459     SDLoc DL(Op);
460     switch(IntrinsicID) {
461     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
462     case AMDGPUIntrinsic::r600_tex:
463     case AMDGPUIntrinsic::r600_texc: {
464       unsigned TextureOp;
465       switch (IntrinsicID) {
466       case AMDGPUIntrinsic::r600_tex:
467         TextureOp = 0;
468         break;
469       case AMDGPUIntrinsic::r600_texc:
470         TextureOp = 1;
471         break;
472       default:
473         llvm_unreachable("unhandled texture operation");
474       }
475 
476       SDValue TexArgs[19] = {
477         DAG.getConstant(TextureOp, DL, MVT::i32),
478         Op.getOperand(1),
479         DAG.getConstant(0, DL, MVT::i32),
480         DAG.getConstant(1, DL, MVT::i32),
481         DAG.getConstant(2, DL, MVT::i32),
482         DAG.getConstant(3, DL, MVT::i32),
483         Op.getOperand(2),
484         Op.getOperand(3),
485         Op.getOperand(4),
486         DAG.getConstant(0, DL, MVT::i32),
487         DAG.getConstant(1, DL, MVT::i32),
488         DAG.getConstant(2, DL, MVT::i32),
489         DAG.getConstant(3, DL, MVT::i32),
490         Op.getOperand(5),
491         Op.getOperand(6),
492         Op.getOperand(7),
493         Op.getOperand(8),
494         Op.getOperand(9),
495         Op.getOperand(10)
496       };
497       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
498     }
499     case AMDGPUIntrinsic::r600_dot4: {
500       SDValue Args[8] = {
501       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
502           DAG.getConstant(0, DL, MVT::i32)),
503       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
504           DAG.getConstant(0, DL, MVT::i32)),
505       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
506           DAG.getConstant(1, DL, MVT::i32)),
507       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
508           DAG.getConstant(1, DL, MVT::i32)),
509       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
510           DAG.getConstant(2, DL, MVT::i32)),
511       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
512           DAG.getConstant(2, DL, MVT::i32)),
513       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
514           DAG.getConstant(3, DL, MVT::i32)),
515       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
516           DAG.getConstant(3, DL, MVT::i32))
517       };
518       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
519     }
520 
521     case Intrinsic::r600_implicitarg_ptr: {
522       MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
523       uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
524       return DAG.getConstant(ByteOffset, DL, PtrVT);
525     }
526     case Intrinsic::r600_read_ngroups_x:
527       return LowerImplicitParameter(DAG, VT, DL, 0);
528     case Intrinsic::r600_read_ngroups_y:
529       return LowerImplicitParameter(DAG, VT, DL, 1);
530     case Intrinsic::r600_read_ngroups_z:
531       return LowerImplicitParameter(DAG, VT, DL, 2);
532     case Intrinsic::r600_read_global_size_x:
533       return LowerImplicitParameter(DAG, VT, DL, 3);
534     case Intrinsic::r600_read_global_size_y:
535       return LowerImplicitParameter(DAG, VT, DL, 4);
536     case Intrinsic::r600_read_global_size_z:
537       return LowerImplicitParameter(DAG, VT, DL, 5);
538     case Intrinsic::r600_read_local_size_x:
539       return LowerImplicitParameter(DAG, VT, DL, 6);
540     case Intrinsic::r600_read_local_size_y:
541       return LowerImplicitParameter(DAG, VT, DL, 7);
542     case Intrinsic::r600_read_local_size_z:
543       return LowerImplicitParameter(DAG, VT, DL, 8);
544 
545     case Intrinsic::r600_read_tgid_x:
546       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
547                                   AMDGPU::T1_X, VT);
548     case Intrinsic::r600_read_tgid_y:
549       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
550                                   AMDGPU::T1_Y, VT);
551     case Intrinsic::r600_read_tgid_z:
552       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
553                                   AMDGPU::T1_Z, VT);
554     case Intrinsic::r600_read_tidig_x:
555       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
556                                   AMDGPU::T0_X, VT);
557     case Intrinsic::r600_read_tidig_y:
558       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
559                                   AMDGPU::T0_Y, VT);
560     case Intrinsic::r600_read_tidig_z:
561       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
562                                   AMDGPU::T0_Z, VT);
563 
564     case Intrinsic::r600_recipsqrt_ieee:
565       return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
566 
567     case Intrinsic::r600_recipsqrt_clamped:
568       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
569     }
570 
571     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
572     break;
573   }
574   } // end switch(Op.getOpcode())
575   return SDValue();
576 }
577 
578 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
579                                             SmallVectorImpl<SDValue> &Results,
580                                             SelectionDAG &DAG) const {
581   switch (N->getOpcode()) {
582   default:
583     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
584     return;
585   case ISD::FP_TO_UINT:
586     if (N->getValueType(0) == MVT::i1) {
587       Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
588       return;
589     }
590     // Since we don't care about out of bounds values we can use FP_TO_SINT for
591     // uints too. The DAGLegalizer code for uint considers some extra cases
592     // which are not necessary here.
593     LLVM_FALLTHROUGH;
594   case ISD::FP_TO_SINT: {
595     if (N->getValueType(0) == MVT::i1) {
596       Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
597       return;
598     }
599 
600     SDValue Result;
601     if (expandFP_TO_SINT(N, Result, DAG))
602       Results.push_back(Result);
603     return;
604   }
605   case ISD::SDIVREM: {
606     SDValue Op = SDValue(N, 1);
607     SDValue RES = LowerSDIVREM(Op, DAG);
608     Results.push_back(RES);
609     Results.push_back(RES.getValue(1));
610     break;
611   }
612   case ISD::UDIVREM: {
613     SDValue Op = SDValue(N, 0);
614     LowerUDIVREM64(Op, DAG, Results);
615     break;
616   }
617   }
618 }
619 
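// Rebuild a vector value as a BUILD_VERTICAL_VECTOR of its scalar elements.
// The EXTRACT/INSERT_VECTOR_ELT lowerings below convert their vector operand
// to this form when the element index is not a constant.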
620 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
621                                                    SDValue Vector) const {
622 
623   SDLoc DL(Vector);
624   EVT VecVT = Vector.getValueType();
625   EVT EltVT = VecVT.getVectorElementType();
626   SmallVector<SDValue, 8> Args;
627 
  for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
630     Args.push_back(DAG.getNode(
631         ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
632         DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
633   }
634 
635   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
636 }
637 
638 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
639                                                     SelectionDAG &DAG) const {
640 
641   SDLoc DL(Op);
642   SDValue Vector = Op.getOperand(0);
643   SDValue Index = Op.getOperand(1);
644 
645   if (isa<ConstantSDNode>(Index) ||
646       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
647     return Op;
648 
649   Vector = vectorToVerticalVector(DAG, Vector);
650   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
651                      Vector, Index);
652 }
653 
654 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
655                                                    SelectionDAG &DAG) const {
656   SDLoc DL(Op);
657   SDValue Vector = Op.getOperand(0);
658   SDValue Value = Op.getOperand(1);
659   SDValue Index = Op.getOperand(2);
660 
661   if (isa<ConstantSDNode>(Index) ||
662       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
663     return Op;
664 
665   Vector = vectorToVerticalVector(DAG, Vector);
666   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
667                                Vector, Value, Index);
668   return vectorToVerticalVector(DAG, Insert);
669 }
670 
671 SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
672                                                SDValue Op,
673                                                SelectionDAG &DAG) const {
674 
675   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
676   if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
677     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
678 
679   const DataLayout &DL = DAG.getDataLayout();
680   const GlobalValue *GV = GSD->getGlobal();
681   MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
682 
683   SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
684   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
685 }
686 
687 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, the COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
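  // Note: 0.15915494309 below is 1/(2*Pi), so FractPart computes
  // FRACT(x / 2Pi + 0.5).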
690   EVT VT = Op.getValueType();
691   SDValue Arg = Op.getOperand(0);
692   SDLoc DL(Op);
693 
694   // TODO: Should this propagate fast-math-flags?
695   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
696       DAG.getNode(ISD::FADD, DL, VT,
697         DAG.getNode(ISD::FMUL, DL, VT, Arg,
698           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
699         DAG.getConstantFP(0.5, DL, MVT::f32)));
700   unsigned TrigNode;
701   switch (Op.getOpcode()) {
702   case ISD::FCOS:
703     TrigNode = AMDGPUISD::COS_HW;
704     break;
705   case ISD::FSIN:
706     TrigNode = AMDGPUISD::SIN_HW;
707     break;
708   default:
709     llvm_unreachable("Wrong trig opcode");
710   }
711   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
712       DAG.getNode(ISD::FADD, DL, VT, FractPart,
713         DAG.getConstantFP(-0.5, DL, MVT::f32)));
714   if (Gen >= R600Subtarget::R700)
715     return TrigVal;
716   // On R600 hw, COS/SIN input must be between -Pi and Pi.
717   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
718       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
719 }
720 
721 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
722   SDLoc DL(Op);
723   EVT VT = Op.getValueType();
724 
725   SDValue Lo = Op.getOperand(0);
726   SDValue Hi = Op.getOperand(1);
727   SDValue Shift = Op.getOperand(2);
728   SDValue Zero = DAG.getConstant(0, DL, VT);
729   SDValue One  = DAG.getConstant(1, DL, VT);
730 
731   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
732   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
733   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
734   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
735 
  // The dance around Width1 is necessary for the Shift == 0 special case.
  // Without it, CompShift could be 32, producing an incorrect result in
  // Overflow. So we do the shift in two steps; the alternative would be to
  // add a conditional to filter out the special case.
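  // For example, with Shift == 5 the two steps compute (Lo >> 26) >> 1, i.e.
  // Lo >> 27, while for Shift == 0 they yield 0 instead of the incorrect
  // single shift Lo >> 32.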
740 
741   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
742   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
743 
744   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
745   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
746   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
747 
748   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
749   SDValue LoBig = Zero;
750 
751   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
752   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
753 
754   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
755 }
756 
757 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
758   SDLoc DL(Op);
759   EVT VT = Op.getValueType();
760 
761   SDValue Lo = Op.getOperand(0);
762   SDValue Hi = Op.getOperand(1);
763   SDValue Shift = Op.getOperand(2);
764   SDValue Zero = DAG.getConstant(0, DL, VT);
765   SDValue One  = DAG.getConstant(1, DL, VT);
766 
767   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
768 
769   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
770   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
771   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
772   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
773 
  // The dance around Width1 is necessary for the Shift == 0 special case.
  // Without it, CompShift could be 32, producing an incorrect result in
  // Overflow. So we do the shift in two steps; the alternative would be to
  // add a conditional to filter out the special case.
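  // As in LowerSHLParts, e.g. for Shift == 5 this computes (Hi << 26) << 1,
  // i.e. Hi << 27, and for Shift == 0 it yields 0 rather than Hi << 32.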
778 
779   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
780   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
781 
782   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
783   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
784   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
785 
786   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
787   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
788 
789   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
790   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
791 
792   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
793 }
794 
795 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
796                                           unsigned mainop, unsigned ovf) const {
797   SDLoc DL(Op);
798   EVT VT = Op.getValueType();
799 
800   SDValue Lo = Op.getOperand(0);
801   SDValue Hi = Op.getOperand(1);
802 
803   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
804   // Extend sign.
805   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
806                     DAG.getValueType(MVT::i1));
807 
808   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
809 
810   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
811 }
812 
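// For an i1 result (see ReplaceNodeResults), FP_TO_UINT and FP_TO_SINT are
// lowered to a single compare against the floating-point boolean value:
// 1.0f for the unsigned case and -1.0f for the signed case.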
813 SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
814   SDLoc DL(Op);
815   return DAG.getNode(
816       ISD::SETCC,
817       DL,
818       MVT::i1,
819       Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
820       DAG.getCondCode(ISD::SETEQ));
821 }
822 
823 SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
824   SDLoc DL(Op);
825   return DAG.getNode(
826       ISD::SETCC,
827       DL,
828       MVT::i1,
829       Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
830       DAG.getCondCode(ISD::SETEQ));
831 }
832 
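// Implicit kernel parameters (work group counts, global and local sizes) are
// stored at fixed dword offsets at the start of CONSTANT_BUFFER_0; the
// r600_read_ngroups/global_size/local_size lowerings above select the offset.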
833 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
834                                                    const SDLoc &DL,
835                                                    unsigned DwordOffset) const {
836   unsigned ByteOffset = DwordOffset * 4;
837   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
838                                       AMDGPUAS::CONSTANT_BUFFER_0);
839 
  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
841   assert(isInt<16>(ByteOffset));
842 
843   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
844                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
845                      MachinePointerInfo(ConstantPointerNull::get(PtrType)));
846 }
847 
848 bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
852     return CstFP->isZero();
853   } else {
854     return false;
855   }
856 }
857 
858 bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
859   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
860     return CFP->isExactlyValue(1.0);
861   }
862   return isAllOnesConstant(Op);
863 }
864 
865 bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
866   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
867     return CFP->getValueAPF().isZero();
868   }
869   return isNullConstant(Op);
870 }
871 
872 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
873   SDLoc DL(Op);
874   EVT VT = Op.getValueType();
875 
876   SDValue LHS = Op.getOperand(0);
877   SDValue RHS = Op.getOperand(1);
878   SDValue True = Op.getOperand(2);
879   SDValue False = Op.getOperand(3);
880   SDValue CC = Op.getOperand(4);
881   SDValue Temp;
882 
883   if (VT == MVT::f32) {
884     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
885     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
886     if (MinMax)
887       return MinMax;
888   }
889 
890   // LHS and RHS are guaranteed to be the same value type
891   EVT CompareVT = LHS.getValueType();
892 
893   // Check if we can lower this to a native operation.
894 
895   // Try to lower to a SET* instruction:
896   //
897   // SET* can match the following patterns:
898   //
899   // select_cc f32, f32, -1,  0, cc_supported
900   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
901   // select_cc i32, i32, -1,  0, cc_supported
902   //
903 
904   // Move hardware True/False values to the correct operand.
905   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
906   ISD::CondCode InverseCC =
907      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
908   if (isHWTrueValue(False) && isHWFalseValue(True)) {
909     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
910       std::swap(False, True);
911       CC = DAG.getCondCode(InverseCC);
912     } else {
913       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
914       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
915         std::swap(False, True);
916         std::swap(LHS, RHS);
917         CC = DAG.getCondCode(SwapInvCC);
918       }
919     }
920   }
921 
922   if (isHWTrueValue(True) && isHWFalseValue(False) &&
923       (CompareVT == VT || VT == MVT::i32)) {
924     // This can be matched by a SET* instruction.
925     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
926   }
927 
928   // Try to lower to a CND* instruction:
929   //
930   // CND* can match the following patterns:
931   //
932   // select_cc f32, 0.0, f32, f32, cc_supported
933   // select_cc f32, 0.0, i32, i32, cc_supported
934   // select_cc i32, 0,   f32, f32, cc_supported
935   // select_cc i32, 0,   i32, i32, cc_supported
936   //
937 
938   // Try to move the zero value to the RHS
939   if (isZero(LHS)) {
940     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
941     // Try swapping the operands
942     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
943     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
944       std::swap(LHS, RHS);
945       CC = DAG.getCondCode(CCSwapped);
946     } else {
      // Try inverting the condition and then swapping the operands.
948       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
949       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
950       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
951         std::swap(True, False);
952         std::swap(LHS, RHS);
953         CC = DAG.getCondCode(CCSwapped);
954       }
955     }
956   }
957   if (isZero(RHS)) {
958     SDValue Cond = LHS;
959     SDValue Zero = RHS;
960     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
961     if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having one pattern
      // for integer True/False and one for fp True/False.
966       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
967       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
968     }
969 
970     switch (CCOpcode) {
971     case ISD::SETONE:
972     case ISD::SETUNE:
973     case ISD::SETNE:
974       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
975       Temp = True;
976       True = False;
977       False = Temp;
978       break;
979     default:
980       break;
981     }
982     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
983         Cond, Zero,
984         True, False,
985         DAG.getCondCode(CCOpcode));
986     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
987   }
988 
  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
991   SDValue HWTrue, HWFalse;
992 
993   if (CompareVT == MVT::f32) {
994     HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
995     HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
996   } else if (CompareVT == MVT::i32) {
997     HWTrue = DAG.getConstant(-1, DL, CompareVT);
998     HWFalse = DAG.getConstant(0, DL, CompareVT);
  } else {
1001     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1002   }
1003 
1004   // Lower this unsupported SELECT_CC into a combination of two supported
1005   // SELECT_CC operations.
1006   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1007 
1008   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1009       Cond, HWFalse,
1010       True, False,
1011       DAG.getCondCode(ISD::SETNE));
1012 }
1013 
/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
1019 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1020                                                unsigned StackWidth,
1021                                                SelectionDAG &DAG) const {
1022   unsigned SRLPad;
1023   switch(StackWidth) {
1024   case 1:
1025     SRLPad = 2;
1026     break;
1027   case 2:
1028     SRLPad = 3;
1029     break;
1030   case 4:
1031     SRLPad = 4;
1032     break;
1033   default: llvm_unreachable("Invalid stack width");
1034   }
1035 
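  // Each stack register provides StackWidth 32-bit channels, so the register
  // index is the byte address divided by 4 * StackWidth, i.e. a shift by
  // 2, 3 or 4 as selected above.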
1036   SDLoc DL(Ptr);
1037   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1038                      DAG.getConstant(SRLPad, DL, MVT::i32));
1039 }
1040 
1041 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1042                                          unsigned ElemIdx,
1043                                          unsigned &Channel,
1044                                          unsigned &PtrIncr) const {
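  // Map element ElemIdx onto the channel within the current stack register and
  // the increment to apply to the register index before accessing it. Callers
  // accumulate PtrIncr while iterating over the vector elements.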
1045   switch (StackWidth) {
1046   default:
1047   case 1:
1048     Channel = 0;
1049     if (ElemIdx > 0) {
1050       PtrIncr = 1;
1051     } else {
1052       PtrIncr = 0;
1053     }
1054     break;
1055   case 2:
1056     Channel = ElemIdx % 2;
1057     if (ElemIdx == 2) {
1058       PtrIncr = 1;
1059     } else {
1060       PtrIncr = 0;
1061     }
1062     break;
1063   case 4:
1064     Channel = ElemIdx;
1065     PtrIncr = 0;
1066     break;
1067   }
1068 }
1069 
1070 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
1071                                                    SelectionDAG &DAG) const {
1072   SDLoc DL(Store);
1073 
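  // Private memory is accessed one 32-bit register (dword) at a time, so a
  // sub-dword store is performed as a read-modify-write: load the containing
  // dword, clear the destination byte/short with a mask, OR in the shifted
  // value, and write the dword back.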
1074   unsigned Mask = 0;
1075   if (Store->getMemoryVT() == MVT::i8) {
1076     Mask = 0xff;
1077   } else if (Store->getMemoryVT() == MVT::i16) {
1078     Mask = 0xffff;
1079   }
1080 
1081   SDValue Chain = Store->getChain();
1082   SDValue BasePtr = Store->getBasePtr();
1083   EVT MemVT = Store->getMemoryVT();
1084 
1085   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
1086                             DAG.getConstant(2, DL, MVT::i32));
1087   SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
1088                             Chain, Ptr,
1089                             DAG.getTargetConstant(0, DL, MVT::i32));
1090 
1091   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
1092                                 DAG.getConstant(0x3, DL, MVT::i32));
1093 
1094   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1095                                  DAG.getConstant(3, DL, MVT::i32));
1096 
1097   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
1098                                   Store->getValue());
1099 
1100   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
1101 
1102   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
1103                                      MaskedValue, ShiftAmt);
1104 
1105   SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
1106                                 DAG.getConstant(Mask, DL, MVT::i32),
1107                                 ShiftAmt);
1108   DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
1109                         DAG.getConstant(0xffffffff, DL, MVT::i32));
1110   Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
1111 
1112   SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
1113   return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1114                      Chain, Value, Ptr,
1115                      DAG.getTargetConstant(0, DL, MVT::i32));
1116 }
1117 
1118 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1119   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1120   unsigned AS = StoreNode->getAddressSpace();
1121   SDValue Value = StoreNode->getValue();
1122   EVT ValueVT = Value.getValueType();
1123   EVT MemVT = StoreNode->getMemoryVT();
1124   unsigned Align = StoreNode->getAlignment();
1125 
1126   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
1127       ValueVT.isVector()) {
1128     return SplitVectorStore(Op, DAG);
1129   }
1130 
  // Private address space stores need special handling and are dealt with
  // further below.
  if (Align < MemVT.getStoreSize() && (AS != AMDGPUAS::PRIVATE_ADDRESS) &&
      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
1134     return expandUnalignedStore(StoreNode, DAG);
1135   }
1136 
1137   SDLoc DL(Op);
1138   SDValue Chain = StoreNode->getChain();
1139   SDValue Ptr = StoreNode->getBasePtr();
1140 
1141   if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
    // It is beneficial to create MSKOR here instead of in the combiner to
    // avoid the artificial dependencies introduced by a read-modify-write
    // sequence.
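    // STORE_MSKOR takes the shifted value together with a mask covering the
    // stored bits and merges the value into the addressed dword, so the
    // byte/short truncating store needs no explicit load of the destination.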
1144     if (StoreNode->isTruncatingStore()) {
1145       EVT VT = Value.getValueType();
1146       assert(VT.bitsLE(MVT::i32));
1147       SDValue MaskConstant;
1148       if (MemVT == MVT::i8) {
1149         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1150       } else {
1151         assert(MemVT == MVT::i16);
1152         assert(StoreNode->getAlignment() >= 2);
1153         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1154       }
1155       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1156                                       DAG.getConstant(2, DL, MVT::i32));
1157       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1158                                       DAG.getConstant(0x00000003, DL, VT));
1159       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1160       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1161                                    DAG.getConstant(3, DL, VT));
1162       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1163       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1164       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1165       // vector instead.
1166       SDValue Src[4] = {
1167         ShiftedValue,
1168         DAG.getConstant(0, DL, MVT::i32),
1169         DAG.getConstant(0, DL, MVT::i32),
1170         Mask
1171       };
1172       SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
1173       SDValue Args[3] = { Chain, Input, DWordAddr };
1174       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1175                                      Op->getVTList(), Args, MemVT,
1176                                      StoreNode->getMemOperand());
1177     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1178                ValueVT.bitsGE(MVT::i32)) {
1179       // Convert pointer from byte address to dword address.
1180       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1181                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1182                                     Ptr, DAG.getConstant(2, DL, MVT::i32)));
1183 
1184       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1185         llvm_unreachable("Truncated and indexed stores not supported yet");
1186       } else {
1187         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1188       }
1189       return Chain;
1190     }
1191   }
1192 
1193   if (AS != AMDGPUAS::PRIVATE_ADDRESS)
1194     return SDValue();
1195 
1196   if (MemVT.bitsLT(MVT::i32))
1197     return lowerPrivateTruncStore(StoreNode, DAG);
1198 
1199   // Lowering for indirect addressing
1200   const MachineFunction &MF = DAG.getMachineFunction();
1201   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1202   unsigned StackWidth = TFL->getStackWidth(MF);
1203 
1204   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1205 
1206   if (ValueVT.isVector()) {
1207     unsigned NumElemVT = ValueVT.getVectorNumElements();
1208     EVT ElemVT = ValueVT.getVectorElementType();
1209     SmallVector<SDValue, 4> Stores(NumElemVT);
1210 
    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in store");
1213 
1214     for (unsigned i = 0; i < NumElemVT; ++i) {
1215       unsigned Channel, PtrIncr;
1216       getStackAddress(StackWidth, i, Channel, PtrIncr);
1217       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1218                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1219       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1220                                  Value, DAG.getConstant(i, DL, MVT::i32));
1221 
1222       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1223                               Chain, Elem, Ptr,
1224                               DAG.getTargetConstant(Channel, DL, MVT::i32));
1225     }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value,
                        Ptr,
                        DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
1233   }
1234 
1235   return Chain;
1236 }
1237 
// Returns 512 + (kc_bank << 12).
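// e.g. CONSTANT_BUFFER_2 maps to 512 + (2 << 12) = 8704.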
1239 static int
1240 ConstantAddressBlock(unsigned AddressSpace) {
1241   switch (AddressSpace) {
1242   case AMDGPUAS::CONSTANT_BUFFER_0:
1243     return 512;
1244   case AMDGPUAS::CONSTANT_BUFFER_1:
1245     return 512 + 4096;
1246   case AMDGPUAS::CONSTANT_BUFFER_2:
1247     return 512 + 4096 * 2;
1248   case AMDGPUAS::CONSTANT_BUFFER_3:
1249     return 512 + 4096 * 3;
1250   case AMDGPUAS::CONSTANT_BUFFER_4:
1251     return 512 + 4096 * 4;
1252   case AMDGPUAS::CONSTANT_BUFFER_5:
1253     return 512 + 4096 * 5;
1254   case AMDGPUAS::CONSTANT_BUFFER_6:
1255     return 512 + 4096 * 6;
1256   case AMDGPUAS::CONSTANT_BUFFER_7:
1257     return 512 + 4096 * 7;
1258   case AMDGPUAS::CONSTANT_BUFFER_8:
1259     return 512 + 4096 * 8;
1260   case AMDGPUAS::CONSTANT_BUFFER_9:
1261     return 512 + 4096 * 9;
1262   case AMDGPUAS::CONSTANT_BUFFER_10:
1263     return 512 + 4096 * 10;
1264   case AMDGPUAS::CONSTANT_BUFFER_11:
1265     return 512 + 4096 * 11;
1266   case AMDGPUAS::CONSTANT_BUFFER_12:
1267     return 512 + 4096 * 12;
1268   case AMDGPUAS::CONSTANT_BUFFER_13:
1269     return 512 + 4096 * 13;
1270   case AMDGPUAS::CONSTANT_BUFFER_14:
1271     return 512 + 4096 * 14;
1272   case AMDGPUAS::CONSTANT_BUFFER_15:
1273     return 512 + 4096 * 15;
1274   default:
1275     return -1;
1276   }
1277 }
1278 
1279 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
1280                                                 SelectionDAG &DAG) const {
1281   SDLoc DL(Op);
1282   LoadSDNode *Load = cast<LoadSDNode>(Op);
1283   ISD::LoadExtType ExtType = Load->getExtensionType();
1284   EVT MemVT = Load->getMemoryVT();
1285 
  // On these (pre-SI) targets, extending loads from the private address space
  // that are narrower than 32 bits are lowered to a register load followed by
  // a byte/short extract.
1288 
1289   // Get Register holding the target.
1290   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
1291                             DAG.getConstant(2, DL, MVT::i32));
1292   // Load the Register.
1293   SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
1294                             Load->getChain(),
1295                             Ptr,
1296                             DAG.getTargetConstant(0, DL, MVT::i32),
1297                             Op.getOperand(2));
1298 
1299   // Get offset within the register.
1300   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
1301                                 Load->getBasePtr(),
1302                                 DAG.getConstant(0x3, DL, MVT::i32));
1303 
1304   // Bit offset of target byte (byteIdx * 8).
1305   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1306                                  DAG.getConstant(3, DL, MVT::i32));
1307 
1308   // Shift to the right.
1309   Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
1310 
1311   // Eliminate the upper bits by setting them to ...
1312   EVT MemEltVT = MemVT.getScalarType();
1313 
1314   // ... ones.
1315   if (ExtType == ISD::SEXTLOAD) {
1316     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
1317 
1318     SDValue Ops[] = {
1319       DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
1320       Load->getChain()
1321     };
1322 
1323     return DAG.getMergeValues(Ops, DL);
1324   }
1325 
1326   // ... or zeros.
1327   SDValue Ops[] = {
1328     DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
1329     Load->getChain()
1330   };
1331 
1332   return DAG.getMergeValues(Ops, DL);
1333 }
1334 
1335 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1336   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1337   unsigned AS = LoadNode->getAddressSpace();
1338   EVT MemVT = LoadNode->getMemoryVT();
1339   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
1340 
1341   if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
1342       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
1343     return lowerPrivateExtLoad(Op, DAG);
1344   }
1345 
1346   SDLoc DL(Op);
1347   EVT VT = Op.getValueType();
1348   SDValue Chain = LoadNode->getChain();
1349   SDValue Ptr = LoadNode->getBasePtr();
1350 
1351   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1352     SDValue MergedValues[2] = {
1353       scalarizeVectorLoad(LoadNode, DAG),
1354       Chain
1355     };
1356     return DAG.getMergeValues(MergedValues, DL);
1357   }
1358 
1359   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1360   if (ConstantBlock > -1 &&
1361       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1362        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1363     SDValue Result;
1364     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1365         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1366         isa<ConstantSDNode>(Ptr)) {
1367       SDValue Slots[4];
1368       for (unsigned i = 0; i < 4; i++) {
        // We want the constant position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is the Ptr computed by LLVM using an alignment of 16.
        // Thus we add ((512 + (kc_bank << 12)) + chan) * 4 here and
        // then divide by 4 at the ISel step.
1374         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1375             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1376         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1377       }
1378       EVT NewVT = MVT::v4i32;
1379       unsigned NumElements = 4;
1380       if (VT.isVector()) {
1381         NewVT = VT;
1382         NumElements = VT.getVectorNumElements();
1383       }
1384       Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
1385     } else {
1386       // A non-constant pointer can't be folded; keep it as a v4i32 load.
1387       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1388           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1389                       DAG.getConstant(4, DL, MVT::i32)),
1390                       DAG.getConstant(LoadNode->getAddressSpace() -
1391                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1392           );
1393     }
1394 
1395     if (!VT.isVector()) {
1396       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1397                            DAG.getConstant(0, DL, MVT::i32));
1398     }
1399 
1400     SDValue MergedValues[2] = {
1401       Result,
1402       Chain
1403     };
1404     return DAG.getMergeValues(MergedValues, DL);
1405   }
1406 
1407   SDValue LoweredLoad;
1408 
1409   // For most operations returning SDValue() will result in the node being
1410   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1411   // need to manually expand loads that may be legal in some address spaces and
1412   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1413   // compute shaders, since the data is sign extended when it is uploaded to the
1414   // buffer. However, SEXT loads from other address spaces are not supported, so
1415   // we need to expand them here.
1416   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1417     EVT MemVT = LoadNode->getMemoryVT();
1418     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1419     SDValue NewLoad = DAG.getExtLoad(
1420         ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
1421         LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
1422     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1423                               DAG.getValueType(MemVT));
1424 
1425     SDValue MergedValues[2] = { Res, Chain };
1426     return DAG.getMergeValues(MergedValues, DL);
1427   }
1428 
1429   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1430     return SDValue();
1431   }
1432 
1433   // Lowering for indirect addressing
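  //
  // Private-address loads are emulated with REGISTER_LOAD pseudo instructions
  // that index into the in-register stack: the byte pointer is first turned
  // into a register index (see stackPtrToRegIndex), and vector loads are split
  // into one REGISTER_LOAD per element/channel.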
1434   const MachineFunction &MF = DAG.getMachineFunction();
1435   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1436   unsigned StackWidth = TFL->getStackWidth(MF);
1437 
1438   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1439 
1440   if (VT.isVector()) {
1441     unsigned NumElemVT = VT.getVectorNumElements();
1442     EVT ElemVT = VT.getVectorElementType();
1443     SDValue Loads[4];
1444 
1445     assert(NumElemVT <= 4);
1446     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1447                                       "vector width in load");
1448 
1449     for (unsigned i = 0; i < NumElemVT; ++i) {
1450       unsigned Channel, PtrIncr;
1451       getStackAddress(StackWidth, i, Channel, PtrIncr);
1452       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1453                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1454       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1455                              Chain, Ptr,
1456                              DAG.getTargetConstant(Channel, DL, MVT::i32),
1457                              Op.getOperand(2));
1458     }
1459     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
1460     LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
1461   } else {
1462     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1463                               Chain, Ptr,
1464                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1465                               Op.getOperand(2));
1466   }
1467 
1468   SDValue Ops[2] = {
1469     LoweredLoad,
1470     Chain
1471   };
1472 
1473   return DAG.getMergeValues(Ops, DL);
1474 }
1475 
1476 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1477   SDValue Chain = Op.getOperand(0);
1478   SDValue Cond  = Op.getOperand(1);
1479   SDValue Jump  = Op.getOperand(2);
1480 
1481   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1482                      Chain, Jump, Cond);
1483 }
1484 
1485 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
1486                                             SelectionDAG &DAG) const {
1487   MachineFunction &MF = DAG.getMachineFunction();
1488   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1489 
1490   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
1491 
1492   unsigned FrameIndex = FIN->getIndex();
1493   unsigned IgnoredFrameReg;
1494   unsigned Offset =
1495     TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
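  // Scale the frame-index offset by the 4-byte register size and the stack
  // width; this appears to match the scaling that stackPtrToRegIndex undoes
  // when private-address accesses are lowered.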
1496   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
1497                          Op.getValueType());
1498 }
1499 
1500 /// XXX Only kernel functions are supported, so we can assume for now that
1501 /// every function is a kernel function, but in the future we should use
1502 /// separate calling conventions for kernel and non-kernel functions.
1503 SDValue R600TargetLowering::LowerFormalArguments(
1504     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1505     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1506     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1507   SmallVector<CCValAssign, 16> ArgLocs;
1508   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1509                  *DAG.getContext());
1510   MachineFunction &MF = DAG.getMachineFunction();
1511   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1512 
1513   SmallVector<ISD::InputArg, 8> LocalIns;
1514 
1515   if (AMDGPU::isShader(CallConv)) {
1516     AnalyzeFormalArguments(CCInfo, Ins);
1517   } else {
1518     analyzeFormalArgumentsCompute(CCInfo, Ins);
1519   }
1520 
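  // Shader arguments arrive in registers and are copied out of their live-in
  // registers below; compute-kernel arguments are instead loaded from
  // CONSTANT_BUFFER_0 at the offsets assigned above.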
1521   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1522     CCValAssign &VA = ArgLocs[i];
1523     const ISD::InputArg &In = Ins[i];
1524     EVT VT = In.VT;
1525     EVT MemVT = VA.getLocVT();
1526     if (!VT.isVector() && MemVT.isVector()) {
1527       // Get load source type if scalarized.
1528       MemVT = MemVT.getVectorElementType();
1529     }
1530 
1531     if (AMDGPU::isShader(CallConv)) {
1532       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1533       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1534       InVals.push_back(Register);
1535       continue;
1536     }
1537 
1538     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1539                                           AMDGPUAS::CONSTANT_BUFFER_0);
1540 
1541     // i64 isn't a legal type, so the register type used ends up as i32, which
1542     // isn't expected here. It attempts to create this sextload, but it ends up
1543     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1544     // for <1 x i64>.
1545 
1546     // The first 36 bytes of the input buffer contain information about
1547     // the thread group and global sizes.
1548     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1549     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1550       // FIXME: This should really check the extload type, but the handling of
1551       // extload vector parameters seems to be broken.
1552 
1553       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1554       Ext = ISD::SEXTLOAD;
1555     }
1556 
1557     // Compute the offset from the value.
1558     // XXX - I think PartOffset should give you this, but it seems to give the
1559   // size of the register, which isn't useful.
1560 
1561     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1562     unsigned PartOffset = VA.getLocMemOffset();
1563     unsigned Offset = Subtarget->getExplicitKernelArgOffset() + VA.getLocMemOffset();
1564 
1565     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1566     SDValue Arg = DAG.getLoad(
1567         ISD::UNINDEXED, Ext, VT, DL, Chain,
1568         DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
1569         MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
1570                                         MachineMemOperand::MODereferenceable |
1571                                         MachineMemOperand::MOInvariant);
1572 
1573     // 4 is the preferred alignment for the CONSTANT memory space.
1574     InVals.push_back(Arg);
1575     MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
1576   }
1577   return Chain;
1578 }
1579 
1580 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1581                                            EVT VT) const {
1582   if (!VT.isVector())
1583     return MVT::i32;
1584   return VT.changeVectorElementTypeToInteger();
1585 }
1586 
1587 bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1588                                                         unsigned AddrSpace,
1589                                                         unsigned Align,
1590                                                         bool *IsFast) const {
1591   if (IsFast)
1592     *IsFast = false;
1593 
1594   if (!VT.isSimple() || VT == MVT::Other)
1595     return false;
1596 
1597   if (VT.bitsLT(MVT::i32))
1598     return false;
1599 
1600   // TODO: This is a rough estimate.
1601   if (IsFast)
1602     *IsFast = true;
1603 
1604   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1605 }
1606 
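/// \brief Fold BUILD_VECTOR operands that a swizzle can encode directly.
///
/// Constant 0.0 and 1.0 elements become SEL_0/SEL_1 selectors, undef elements
/// become mask-writes, and elements equal to an earlier lane are rerouted to
/// that lane. RemapSwizzle records the old -> new selector for each channel.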
1607 static SDValue CompactSwizzlableVector(
1608   SelectionDAG &DAG, SDValue VectorEntry,
1609   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1610   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1611   assert(RemapSwizzle.empty());
1612   SDValue NewBldVec[4] = {
1613     VectorEntry.getOperand(0),
1614     VectorEntry.getOperand(1),
1615     VectorEntry.getOperand(2),
1616     VectorEntry.getOperand(3)
1617   };
1618 
1619   for (unsigned i = 0; i < 4; i++) {
1620     if (NewBldVec[i].isUndef())
1621       // We mask the write here to teach later passes that the ith element of
1622       // this vector is undef, which reduces 128-bit register usage, breaks
1623       // false dependencies and additionally makes the assembly easier to read.
1624       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1625     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1626       if (C->isZero()) {
1627         RemapSwizzle[i] = 4; // SEL_0
1628         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1629       } else if (C->isExactlyValue(1.0)) {
1630         RemapSwizzle[i] = 5; // SEL_1
1631         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1632       }
1633     }
1634 
1635     if (NewBldVec[i].isUndef())
1636       continue;
1637     for (unsigned j = 0; j < i; j++) {
1638       if (NewBldVec[i] == NewBldVec[j]) {
1639         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1640         RemapSwizzle[i] = j;
1641         break;
1642       }
1643     }
1644   }
1645 
1646   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1647                             NewBldVec);
1648 }
1649 
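/// \brief Move elements coming from EXTRACT_VECTOR_ELT back toward the lane
/// they were extracted from.
///
/// Elements already sitting in their source lane are pinned, and at most one
/// swap is performed per call; RemapSwizzle records the resulting permutation.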
1650 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1651                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1652   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1653   assert(RemapSwizzle.empty());
1654   SDValue NewBldVec[4] = {
1655       VectorEntry.getOperand(0),
1656       VectorEntry.getOperand(1),
1657       VectorEntry.getOperand(2),
1658       VectorEntry.getOperand(3)
1659   };
1660   bool isUnmovable[4] = { false, false, false, false };
1661   for (unsigned i = 0; i < 4; i++) {
1662     RemapSwizzle[i] = i;
1663     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1664       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1665           ->getZExtValue();
1666       if (i == Idx)
1667         isUnmovable[Idx] = true;
1668     }
1669   }
1670 
1671   for (unsigned i = 0; i < 4; i++) {
1672     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1673       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1674           ->getZExtValue();
1675       if (isUnmovable[Idx])
1676         continue;
1677       // Swap i and Idx
1678       std::swap(NewBldVec[Idx], NewBldVec[i]);
1679       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1680       break;
1681     }
1682   }
1683 
1684   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1685                             NewBldVec);
1686 }
1687 
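/// \brief Compact and reorder the BUILD_VECTOR feeding an EXPORT or
/// TEXTURE_FETCH node, updating the four swizzle selectors in Swz[] to match.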
1688 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
1689                                             SelectionDAG &DAG,
1690                                             const SDLoc &DL) const {
1691   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1692   // Old -> New swizzle values
1693   DenseMap<unsigned, unsigned> SwizzleRemap;
1694 
1695   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1696   for (unsigned i = 0; i < 4; i++) {
1697     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1698     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1699       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1700   }
1701 
1702   SwizzleRemap.clear();
1703   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1704   for (unsigned i = 0; i < 4; i++) {
1705     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1706     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1707       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1708   }
1709 
1710   return BuildVector;
1711 }
1712 
1713 
1714 //===----------------------------------------------------------------------===//
1715 // Custom DAG Optimizations
1716 //===----------------------------------------------------------------------===//
1717 
1718 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1719                                               DAGCombinerInfo &DCI) const {
1720   SelectionDAG &DAG = DCI.DAG;
1721   SDLoc DL(N);
1722 
1723   switch (N->getOpcode()) {
1724   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1725   case ISD::FP_ROUND: {
1726       SDValue Arg = N->getOperand(0);
1727       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1728         return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
1729                            Arg.getOperand(0));
1730       }
1731       break;
1732     }
1733 
1734   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1735   // (i32 select_cc f32, f32, -1, 0 cc)
1736   //
1737   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1738   // this to one of the SET*_DX10 instructions.
1739   case ISD::FP_TO_SINT: {
1740     SDValue FNeg = N->getOperand(0);
1741     if (FNeg.getOpcode() != ISD::FNEG) {
1742       return SDValue();
1743     }
1744     SDValue SelectCC = FNeg.getOperand(0);
1745     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1746         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1747         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1748         !isHWTrueValue(SelectCC.getOperand(2)) ||
1749         !isHWFalseValue(SelectCC.getOperand(3))) {
1750       return SDValue();
1751     }
1752 
1753     return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
1754                            SelectCC.getOperand(0), // LHS
1755                            SelectCC.getOperand(1), // RHS
1756                            DAG.getConstant(-1, DL, MVT::i32), // True
1757                            DAG.getConstant(0, DL, MVT::i32),  // False
1758                            SelectCC.getOperand(4)); // CC
1761   }
1762 
1763   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1764   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1765   case ISD::INSERT_VECTOR_ELT: {
1766     SDValue InVec = N->getOperand(0);
1767     SDValue InVal = N->getOperand(1);
1768     SDValue EltNo = N->getOperand(2);
1769 
1770     // If the inserted element is an UNDEF, just use the input vector.
1771     if (InVal.isUndef())
1772       return InVec;
1773 
1774     EVT VT = InVec.getValueType();
1775 
1776     // If we can't generate a legal BUILD_VECTOR, exit
1777     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1778       return SDValue();
1779 
1780     // Check that we know which element is being inserted
1781     if (!isa<ConstantSDNode>(EltNo))
1782       return SDValue();
1783     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1784 
1785     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1786     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1787     // vector elements.
1788     SmallVector<SDValue, 8> Ops;
1789     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1790       Ops.append(InVec.getNode()->op_begin(),
1791                  InVec.getNode()->op_end());
1792     } else if (InVec.isUndef()) {
1793       unsigned NElts = VT.getVectorNumElements();
1794       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1795     } else {
1796       return SDValue();
1797     }
1798 
1799     // Insert the element
1800     if (Elt < Ops.size()) {
1801       // All the operands of BUILD_VECTOR must have the same type;
1802       // we enforce that here.
1803       EVT OpVT = Ops[0].getValueType();
1804       if (InVal.getValueType() != OpVT)
1805         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1806           DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
1807           DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
1808       Ops[Elt] = InVal;
1809     }
1810 
1811     // Return the new vector
1812     return DAG.getBuildVector(VT, DL, Ops);
1813   }
1814 
1815   // An extract_vector_elt of a build_vector generated by custom lowering
1816   // also needs to be combined here with a custom rule.
1817   case ISD::EXTRACT_VECTOR_ELT: {
1818     SDValue Arg = N->getOperand(0);
1819     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1820       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1821         unsigned Element = Const->getZExtValue();
1822         return Arg->getOperand(Element);
1823       }
1824     }
1825     if (Arg.getOpcode() == ISD::BITCAST &&
1826         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
1827         (Arg.getOperand(0).getValueType().getVectorNumElements() ==
1828          Arg.getValueType().getVectorNumElements())) {
1829       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1830         unsigned Element = Const->getZExtValue();
1831         return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
1832                            Arg->getOperand(0).getOperand(Element));
1833       }
1834     }
1835     break;
1836   }
1837 
1838   case ISD::SELECT_CC: {
1839     // Try common optimizations
1840     if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
1841       return Ret;
1842 
1843     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1844     //      selectcc x, y, a, b, inv(cc)
1845     //
1846     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1847     //      selectcc x, y, a, b, cc
1848     SDValue LHS = N->getOperand(0);
1849     if (LHS.getOpcode() != ISD::SELECT_CC) {
1850       return SDValue();
1851     }
1852 
1853     SDValue RHS = N->getOperand(1);
1854     SDValue True = N->getOperand(2);
1855     SDValue False = N->getOperand(3);
1856     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1857 
1858     if (LHS.getOperand(2).getNode() != True.getNode() ||
1859         LHS.getOperand(3).getNode() != False.getNode() ||
1860         RHS.getNode() != False.getNode()) {
1861       return SDValue();
1862     }
1863 
1864     switch (NCC) {
1865     default: return SDValue();
1866     case ISD::SETNE: return LHS;
1867     case ISD::SETEQ: {
1868       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1869       LHSCC = ISD::getSetCCInverse(LHSCC,
1870                                   LHS.getOperand(0).getValueType().isInteger());
1871       if (DCI.isBeforeLegalizeOps() ||
1872           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1873         return DAG.getSelectCC(DL,
1874                                LHS.getOperand(0),
1875                                LHS.getOperand(1),
1876                                LHS.getOperand(2),
1877                                LHS.getOperand(3),
1878                                LHSCC);
1879       break;
1880     }
1881     }
1882     return SDValue();
1883   }
1884 
1885   case AMDGPUISD::EXPORT: {
1886     SDValue Arg = N->getOperand(1);
1887     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1888       break;
1889 
1890     SDValue NewArgs[8] = {
1891       N->getOperand(0), // Chain
1892       SDValue(),
1893       N->getOperand(2), // ArrayBase
1894       N->getOperand(3), // Type
1895       N->getOperand(4), // SWZ_X
1896       N->getOperand(5), // SWZ_Y
1897       N->getOperand(6), // SWZ_Z
1898       N->getOperand(7) // SWZ_W
1899     };
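    // OptimizeSwizzle returns the compacted vector for NewArgs[1] and rewrites
    // the four SWZ_* operands (NewArgs[4..7]) in place.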
1900     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
1901     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
1902   }
1903   case AMDGPUISD::TEXTURE_FETCH: {
1904     SDValue Arg = N->getOperand(1);
1905     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1906       break;
1907 
1908     SDValue NewArgs[19] = {
1909       N->getOperand(0),
1910       N->getOperand(1),
1911       N->getOperand(2),
1912       N->getOperand(3),
1913       N->getOperand(4),
1914       N->getOperand(5),
1915       N->getOperand(6),
1916       N->getOperand(7),
1917       N->getOperand(8),
1918       N->getOperand(9),
1919       N->getOperand(10),
1920       N->getOperand(11),
1921       N->getOperand(12),
1922       N->getOperand(13),
1923       N->getOperand(14),
1924       N->getOperand(15),
1925       N->getOperand(16),
1926       N->getOperand(17),
1927       N->getOperand(18),
1928     };
1929     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
1930     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
1931   }
1932   default: break;
1933   }
1934 
1935   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1936 }
1937 
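/// \brief Try to fold the node feeding operand \p SrcIdx of \p ParentNode
/// into the operand encoding itself.
///
/// FNEG/FABS become the neg/abs modifier operands, CONST_COPY becomes an
/// ALU_CONST register plus a sel value (subject to the constant read
/// limitations), and MOV_IMM_* becomes either an inline constant register or
/// the ALU_LITERAL_X literal slot. Returns true and updates the by-reference
/// operands on success.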
1938 bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
1939                                      SDValue &Src, SDValue &Neg, SDValue &Abs,
1940                                      SDValue &Sel, SDValue &Imm,
1941                                      SelectionDAG &DAG) const {
1942   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
1943   if (!Src.isMachineOpcode())
1944     return false;
1945 
1946   switch (Src.getMachineOpcode()) {
1947   case AMDGPU::FNEG_R600:
1948     if (!Neg.getNode())
1949       return false;
1950     Src = Src.getOperand(0);
1951     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
1952     return true;
1953   case AMDGPU::FABS_R600:
1954     if (!Abs.getNode())
1955       return false;
1956     Src = Src.getOperand(0);
1957     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
1958     return true;
1959   case AMDGPU::CONST_COPY: {
1960     unsigned Opcode = ParentNode->getMachineOpcode();
1961     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1962 
1963     if (!Sel.getNode())
1964       return false;
1965 
1966     SDValue CstOffset = Src.getOperand(0);
1967     if (ParentNode->getValueType(0).isVector())
1968       return false;
1969 
1970     // Gather constant values
1971     int SrcIndices[] = {
1972       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1973       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1974       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
1975       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1976       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1977       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1978       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1979       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1980       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1981       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1982       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1983     };
1984     std::vector<unsigned> Consts;
1985     for (int OtherSrcIdx : SrcIndices) {
1986       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1987       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
1988         continue;
1989       if (HasDst) {
1990         OtherSrcIdx--;
1991         OtherSelIdx--;
1992       }
1993       if (RegisterSDNode *Reg =
1994           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
1995         if (Reg->getReg() == AMDGPU::ALU_CONST) {
1996           ConstantSDNode *Cst
1997             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
1998           Consts.push_back(Cst->getZExtValue());
1999         }
2000       }
2001     }
2002 
2003     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2004     Consts.push_back(Cst->getZExtValue());
2005     if (!TII->fitsConstReadLimitations(Consts)) {
2006       return false;
2007     }
2008 
2009     Sel = CstOffset;
2010     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2011     return true;
2012   }
2013   case AMDGPU::MOV_IMM_GLOBAL_ADDR:
2014     // Check if the Imm slot is used. Taken from below.
2015     if (cast<ConstantSDNode>(Imm)->getZExtValue())
2016       return false;
2017     Imm = Src.getOperand(0);
2018     Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
2019     return true;
2020   case AMDGPU::MOV_IMM_I32:
2021   case AMDGPU::MOV_IMM_F32: {
2022     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2023     uint64_t ImmValue = 0;
2024 
2026     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2027       ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
2028       float FloatValue = FPC->getValueAPF().convertToFloat();
2029       if (FloatValue == 0.0) {
2030         ImmReg = AMDGPU::ZERO;
2031       } else if (FloatValue == 0.5) {
2032         ImmReg = AMDGPU::HALF;
2033       } else if (FloatValue == 1.0) {
2034         ImmReg = AMDGPU::ONE;
2035       } else {
2036         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2037       }
2038     } else {
2039       ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
2040       uint64_t Value = C->getZExtValue();
2041       if (Value == 0) {
2042         ImmReg = AMDGPU::ZERO;
2043       } else if (Value == 1) {
2044         ImmReg = AMDGPU::ONE_INT;
2045       } else {
2046         ImmValue = Value;
2047       }
2048     }
2049 
2050     // Check that we aren't already using an immediate.
2051     // XXX: It's possible for an instruction to have more than one
2052     // immediate operand, but this is not supported yet.
2053     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2054       if (!Imm.getNode())
2055         return false;
2056       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2057       assert(C);
2058       if (C->getZExtValue())
2059         return false;
2060       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2061     }
2062     Src = DAG.getRegister(ImmReg, MVT::i32);
2063     return true;
2064   }
2065   default:
2066     return false;
2067   }
2068 }
2069 
2070 /// \brief Fold the instructions after selecting them
2071 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2072                                             SelectionDAG &DAG) const {
2073   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2074   if (!Node->isMachineOpcode())
2075     return Node;
2076 
2077   unsigned Opcode = Node->getMachineOpcode();
2078   SDValue FakeOp;
2079 
2080   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2081 
2082   if (Opcode == AMDGPU::DOT_4) {
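    // DOT_4 has eight scalar sources (src0_X..W and src1_X..W), each with its
    // own neg, abs and sel operands; try to fold into each of them in turn.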
2083     int OperandIdx[] = {
2084       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2085       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2086       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2087       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2088       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2089       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2090       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2091       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2092         };
2093     int NegIdx[] = {
2094       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2095       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2096       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2097       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2098       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2099       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2100       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2101       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2102     };
2103     int AbsIdx[] = {
2104       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2105       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2106       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2107       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2108       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2109       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2110       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2111       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2112     };
2113     for (unsigned i = 0; i < 8; i++) {
2114       if (OperandIdx[i] < 0)
2115         return Node;
2116       SDValue &Src = Ops[OperandIdx[i] - 1];
2117       SDValue &Neg = Ops[NegIdx[i] - 1];
2118       SDValue &Abs = Ops[AbsIdx[i] - 1];
2119       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2120       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2121       if (HasDst)
2122         SelIdx--;
2123       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2124       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2125         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2126     }
2127   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2128     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2129       SDValue &Src = Ops[i];
2130       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2131         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2132     }
2133   } else if (Opcode == AMDGPU::CLAMP_R600) {
2134     SDValue Src = Node->getOperand(0);
2135     if (!Src.isMachineOpcode() ||
2136         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2137       return Node;
2138     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2139         AMDGPU::OpName::clamp);
2140     if (ClampIdx < 0)
2141       return Node;
2142     SDLoc DL(Node);
2143     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2144     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2145     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2146                               Node->getVTList(), Ops);
2147   } else {
2148     if (!TII->hasInstrModifiers(Opcode))
2149       return Node;
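    // Regular ALU instructions have up to three sources, each with a neg
    // modifier; only src0 and src1 carry an abs modifier.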
2150     int OperandIdx[] = {
2151       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2152       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2153       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2154     };
2155     int NegIdx[] = {
2156       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2157       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2158       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2159     };
2160     int AbsIdx[] = {
2161       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2162       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2163       -1
2164     };
2165     for (unsigned i = 0; i < 3; i++) {
2166       if (OperandIdx[i] < 0)
2167         return Node;
2168       SDValue &Src = Ops[OperandIdx[i] - 1];
2169       SDValue &Neg = Ops[NegIdx[i] - 1];
2170       SDValue FakeAbs;
2171       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2172       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2173       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2174       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2175       if (HasDst) {
2176         SelIdx--;
2177         ImmIdx--;
2178       }
2179       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2180       SDValue &Imm = Ops[ImmIdx];
2181       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2182         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2183     }
2184   }
2185 
2186   return Node;
2187 }
2188