1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for R600
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "R600ISelLowering.h"
16 #include "AMDGPUFrameLowering.h"
17 #include "AMDGPUIntrinsicInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "R600Defines.h"
20 #include "R600InstrInfo.h"
21 #include "R600MachineFunctionInfo.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/CodeGen/CallingConvLower.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27 #include "llvm/CodeGen/SelectionDAG.h"
28 #include "llvm/IR/Argument.h"
29 #include "llvm/IR/Function.h"
30 
31 using namespace llvm;
32 
33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
34                                        const AMDGPUSubtarget &STI)
35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
42 
43   computeRegisterProperties(STI.getRegisterInfo());
44 
45   // Set condition code actions
46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
58 
59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
63 
64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
66 
67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
69 
70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
73 
74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
75 
76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
79 
80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
82 
83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
88 
89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
93 
94   // ADD, SUB overflow.
95   // TODO: turn these into Legal?
96   if (Subtarget->hasCARRY())
97     setOperationAction(ISD::UADDO, MVT::i32, Custom);
98 
99   if (Subtarget->hasBORROW())
100     setOperationAction(ISD::USUBO, MVT::i32, Custom);
101 
102   // Expand sign extension of vectors
103   if (!Subtarget->hasBFE())
104     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
105 
106   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
107   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
108 
109   if (!Subtarget->hasBFE())
110     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
113 
114   if (!Subtarget->hasBFE())
115     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
116   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
117   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
118 
119   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
120   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
121   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
122 
123   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
124 
125 
126   // Legalize loads and stores to the private address space.
127   setOperationAction(ISD::LOAD, MVT::i32, Custom);
128   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
129   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
130 
131   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
132   // spaces, so it is custom lowered to handle those where it isn't.
133   for (MVT VT : MVT::integer_valuetypes()) {
134     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
135     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
136     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
137 
138     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
139     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
140     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
141 
142     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
143     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
144     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
145   }
146 
147   setOperationAction(ISD::STORE, MVT::i8, Custom);
148   setOperationAction(ISD::STORE, MVT::i32, Custom);
149   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
150   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
151   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
152   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
153 
154   setOperationAction(ISD::LOAD, MVT::i32, Custom);
155   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
156   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
157 
158   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
159   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
160   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
161   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
162 
163   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
164   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
165   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
166   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
167 
168   setTargetDAGCombine(ISD::FP_ROUND);
169   setTargetDAGCombine(ISD::FP_TO_SINT);
170   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
171   setTargetDAGCombine(ISD::SELECT_CC);
172   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
173 
174   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
175   //  to be Legal/Custom in order to avoid library calls.
176   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
177   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
178   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
179 
180   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
181 
182   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
183   for (MVT VT : ScalarIntVTs) {
184     setOperationAction(ISD::ADDC, VT, Expand);
185     setOperationAction(ISD::SUBC, VT, Expand);
186     setOperationAction(ISD::ADDE, VT, Expand);
187     setOperationAction(ISD::SUBE, VT, Expand);
188   }
189 
190   setSchedulingPreference(Sched::Source);
191 }
192 
193 static inline bool isEOP(MachineBasicBlock::iterator I) {
194   return std::next(I)->getOpcode() == AMDGPU::RETURN;
195 }
196 
/// \brief Expand pseudo instructions after instruction selection.
///
/// Each case below replaces one R600 pseudo opcode with real machine
/// instructions inserted before \p MI in \p BB; at the end the original
/// pseudo is erased (except for the cases that return \p BB early).
MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instruction that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
    if (TII->isLDSRetInstr(MI->getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
      //        LDS_1A2D support and remove this special case.
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
           MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
        return BB;

      // Rebuild as the NORET form, copying every operand except the dst.
      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  // CLAMP/FABS/FNEG become a MOV carrying the corresponding instruction flag.
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   AMDGPU::MOV,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    // Mark the *defining* instruction of the masked register with MO_FLAG_MASK
    // rather than emitting anything here.
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    // Materialize the f32 constant via its raw bit pattern.
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    // MOV from the constant file: the ALU_CONST source selects via src0_sel.
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(isEOP(I)); // Set End of program bit
    break;
  }
  case AMDGPU::RAT_STORE_TYPED_eg: {
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addImm(isEOP(I)); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    // Sample-with-gradients: set H/V gradients into two temporaries, then
    // emit the gradient sample reading them as implicit uses.
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    // Adjust coordinate swizzle (Src*) and coordinate-type bits (CT*)
    // per texture target.
    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    // Implicit T0/T1 uses keep the gradient setters ordered before the sample.
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    // Same expansion as TXD above, but using the shadow-comparison sample
    // opcode (TEX_SAMPLE_C_G).
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
              .addOperand(MI->getOperand(0));
      break;

  case AMDGPU::BRANCH_COND_f32: {
    // PRED_X computes the predicate; JUMP_COND consumes (and kills) it.
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    // Integer variant of the conditional branch expansion above.
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
            AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
           .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if its not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    // Scan forward for a later export of the same type in this block.
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = isEOP(I);
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    // CF instruction encoding differs between Evergreen (84) and R600 (40).
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}
582 
583 //===----------------------------------------------------------------------===//
584 // Custom DAG Lowering Operations
585 //===----------------------------------------------------------------------===//
586 
587 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
588   MachineFunction &MF = DAG.getMachineFunction();
589   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
590   switch (Op.getOpcode()) {
591   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
592   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
593   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
594   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
595   case ISD::SRA_PARTS:
596   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
597   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
598   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
599   case ISD::FCOS:
600   case ISD::FSIN: return LowerTrig(Op, DAG);
601   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
602   case ISD::STORE: return LowerSTORE(Op, DAG);
603   case ISD::LOAD: {
604     SDValue Result = LowerLOAD(Op, DAG);
605     assert((!Result.getNode() ||
606             Result.getNode()->getNumValues() == 2) &&
607            "Load should return a value and a chain");
608     return Result;
609   }
610 
611   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
612   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
613   case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
614   case ISD::INTRINSIC_VOID: {
615     SDValue Chain = Op.getOperand(0);
616     unsigned IntrinsicID =
617                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
618     switch (IntrinsicID) {
619     case AMDGPUIntrinsic::R600_store_swizzle: {
620       SDLoc DL(Op);
621       const SDValue Args[8] = {
622         Chain,
623         Op.getOperand(2), // Export Value
624         Op.getOperand(3), // ArrayBase
625         Op.getOperand(4), // Type
626         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
627         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
628         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
629         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
630       };
631       return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
632     }
633 
634     // default for switch(IntrinsicID)
635     default: break;
636     }
637     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
638     break;
639   }
640   case ISD::INTRINSIC_WO_CHAIN: {
641     unsigned IntrinsicID =
642                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
643     EVT VT = Op.getValueType();
644     SDLoc DL(Op);
645     switch(IntrinsicID) {
646     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
647     case AMDGPUIntrinsic::R600_interp_xy:
648     case AMDGPUIntrinsic::R600_interp_zw: {
649       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
650       MachineSDNode *interp;
651       SDValue RegisterINode = Op.getOperand(2);
652       SDValue RegisterJNode = Op.getOperand(3);
653 
654       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
655         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
656             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
657             RegisterJNode, RegisterINode);
658       else
659         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
660             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
661             RegisterJNode, RegisterINode);
662       return DAG.getBuildVector(MVT::v2f32, DL,
663                                 {SDValue(interp, 0), SDValue(interp, 1)});
664     }
665     case AMDGPUIntrinsic::r600_tex:
666     case AMDGPUIntrinsic::r600_texc:
667     case AMDGPUIntrinsic::r600_txl:
668     case AMDGPUIntrinsic::r600_txlc:
669     case AMDGPUIntrinsic::r600_txb:
670     case AMDGPUIntrinsic::r600_txbc:
671     case AMDGPUIntrinsic::r600_txf:
672     case AMDGPUIntrinsic::r600_txq:
673     case AMDGPUIntrinsic::r600_ddx:
674     case AMDGPUIntrinsic::r600_ddy:
675     case AMDGPUIntrinsic::R600_ldptr: {
676       unsigned TextureOp;
677       switch (IntrinsicID) {
678       case AMDGPUIntrinsic::r600_tex:
679         TextureOp = 0;
680         break;
681       case AMDGPUIntrinsic::r600_texc:
682         TextureOp = 1;
683         break;
684       case AMDGPUIntrinsic::r600_txl:
685         TextureOp = 2;
686         break;
687       case AMDGPUIntrinsic::r600_txlc:
688         TextureOp = 3;
689         break;
690       case AMDGPUIntrinsic::r600_txb:
691         TextureOp = 4;
692         break;
693       case AMDGPUIntrinsic::r600_txbc:
694         TextureOp = 5;
695         break;
696       case AMDGPUIntrinsic::r600_txf:
697         TextureOp = 6;
698         break;
699       case AMDGPUIntrinsic::r600_txq:
700         TextureOp = 7;
701         break;
702       case AMDGPUIntrinsic::r600_ddx:
703         TextureOp = 8;
704         break;
705       case AMDGPUIntrinsic::r600_ddy:
706         TextureOp = 9;
707         break;
708       case AMDGPUIntrinsic::R600_ldptr:
709         TextureOp = 10;
710         break;
711       default:
712         llvm_unreachable("Unknow Texture Operation");
713       }
714 
715       SDValue TexArgs[19] = {
716         DAG.getConstant(TextureOp, DL, MVT::i32),
717         Op.getOperand(1),
718         DAG.getConstant(0, DL, MVT::i32),
719         DAG.getConstant(1, DL, MVT::i32),
720         DAG.getConstant(2, DL, MVT::i32),
721         DAG.getConstant(3, DL, MVT::i32),
722         Op.getOperand(2),
723         Op.getOperand(3),
724         Op.getOperand(4),
725         DAG.getConstant(0, DL, MVT::i32),
726         DAG.getConstant(1, DL, MVT::i32),
727         DAG.getConstant(2, DL, MVT::i32),
728         DAG.getConstant(3, DL, MVT::i32),
729         Op.getOperand(5),
730         Op.getOperand(6),
731         Op.getOperand(7),
732         Op.getOperand(8),
733         Op.getOperand(9),
734         Op.getOperand(10)
735       };
736       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
737     }
738     case AMDGPUIntrinsic::AMDGPU_dp4: {
739       SDValue Args[8] = {
740       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
741           DAG.getConstant(0, DL, MVT::i32)),
742       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
743           DAG.getConstant(0, DL, MVT::i32)),
744       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
745           DAG.getConstant(1, DL, MVT::i32)),
746       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
747           DAG.getConstant(1, DL, MVT::i32)),
748       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
749           DAG.getConstant(2, DL, MVT::i32)),
750       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
751           DAG.getConstant(2, DL, MVT::i32)),
752       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
753           DAG.getConstant(3, DL, MVT::i32)),
754       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
755           DAG.getConstant(3, DL, MVT::i32))
756       };
757       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
758     }
759 
760     case Intrinsic::r600_read_ngroups_x:
761       return LowerImplicitParameter(DAG, VT, DL, 0);
762     case Intrinsic::r600_read_ngroups_y:
763       return LowerImplicitParameter(DAG, VT, DL, 1);
764     case Intrinsic::r600_read_ngroups_z:
765       return LowerImplicitParameter(DAG, VT, DL, 2);
766     case Intrinsic::r600_read_global_size_x:
767       return LowerImplicitParameter(DAG, VT, DL, 3);
768     case Intrinsic::r600_read_global_size_y:
769       return LowerImplicitParameter(DAG, VT, DL, 4);
770     case Intrinsic::r600_read_global_size_z:
771       return LowerImplicitParameter(DAG, VT, DL, 5);
772     case Intrinsic::r600_read_local_size_x:
773       return LowerImplicitParameter(DAG, VT, DL, 6);
774     case Intrinsic::r600_read_local_size_y:
775       return LowerImplicitParameter(DAG, VT, DL, 7);
776     case Intrinsic::r600_read_local_size_z:
777       return LowerImplicitParameter(DAG, VT, DL, 8);
778 
779     case Intrinsic::r600_read_workdim:
780     case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
781       uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
782       return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
783     }
784 
785     case Intrinsic::r600_read_tgid_x:
786       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
787                                   AMDGPU::T1_X, VT);
788     case Intrinsic::r600_read_tgid_y:
789       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
790                                   AMDGPU::T1_Y, VT);
791     case Intrinsic::r600_read_tgid_z:
792       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
793                                   AMDGPU::T1_Z, VT);
794     case Intrinsic::r600_read_tidig_x:
795       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
796                                   AMDGPU::T0_X, VT);
797     case Intrinsic::r600_read_tidig_y:
798       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
799                                   AMDGPU::T0_Y, VT);
800     case Intrinsic::r600_read_tidig_z:
801       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
802                                   AMDGPU::T0_Z, VT);
803 
804     // FIXME: Should be renamed to r600 prefix
805     case AMDGPUIntrinsic::AMDGPU_rsq_clamped:
806       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
807 
808     case Intrinsic::r600_rsq:
809     case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
810       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
811       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
812     }
813     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
814     break;
815   }
816   } // end switch(Op.getOpcode())
817   return SDValue();
818 }
819 
// Provide custom lowering for node results whose types are illegal,
// pushing the replacement values into \p Results.
void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    // Anything not handled here is delegated to the generic AMDGPU lowering.
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    if (N->getValueType(0) == MVT::i1) {
      // An i1 result only needs a zero/non-zero test, not a full conversion.
      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
      return;
    }
    // Fall-through. Since we don't care about out of bounds values
    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
    // considers some extra cases which are not necessary here.
  case ISD::FP_TO_SINT: {
    SDValue Result;
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  case ISD::SDIVREM: {
    // NOTE(review): this wraps result #1 of the node before calling
    // LowerSDIVREM — verify the result index is intentional.
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    // Push quotient then remainder.
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    // LowerUDIVREM64 appends both results itself.
    LowerUDIVREM64(Op, DAG, Results);
    break;
  }
  }
}
855 
856 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
857                                                    SDValue Vector) const {
858 
859   SDLoc DL(Vector);
860   EVT VecVT = Vector.getValueType();
861   EVT EltVT = VecVT.getVectorElementType();
862   SmallVector<SDValue, 8> Args;
863 
864   for (unsigned i = 0, e = VecVT.getVectorNumElements();
865                                                            i != e; ++i) {
866     Args.push_back(DAG.getNode(
867         ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
868         DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
869   }
870 
871   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
872 }
873 
874 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
875                                                     SelectionDAG &DAG) const {
876 
877   SDLoc DL(Op);
878   SDValue Vector = Op.getOperand(0);
879   SDValue Index = Op.getOperand(1);
880 
881   if (isa<ConstantSDNode>(Index) ||
882       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
883     return Op;
884 
885   Vector = vectorToVerticalVector(DAG, Vector);
886   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
887                      Vector, Index);
888 }
889 
890 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
891                                                    SelectionDAG &DAG) const {
892   SDLoc DL(Op);
893   SDValue Vector = Op.getOperand(0);
894   SDValue Value = Op.getOperand(1);
895   SDValue Index = Op.getOperand(2);
896 
897   if (isa<ConstantSDNode>(Index) ||
898       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
899     return Op;
900 
901   Vector = vectorToVerticalVector(DAG, Vector);
902   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
903                                Vector, Value, Index);
904   return vectorToVerticalVector(DAG, Insert);
905 }
906 
907 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
908   // On hw >= R700, COS/SIN input must be between -1. and 1.
909   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
910   EVT VT = Op.getValueType();
911   SDValue Arg = Op.getOperand(0);
912   SDLoc DL(Op);
913 
914   // TODO: Should this propagate fast-math-flags?
915   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
916       DAG.getNode(ISD::FADD, DL, VT,
917         DAG.getNode(ISD::FMUL, DL, VT, Arg,
918           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
919         DAG.getConstantFP(0.5, DL, MVT::f32)));
920   unsigned TrigNode;
921   switch (Op.getOpcode()) {
922   case ISD::FCOS:
923     TrigNode = AMDGPUISD::COS_HW;
924     break;
925   case ISD::FSIN:
926     TrigNode = AMDGPUISD::SIN_HW;
927     break;
928   default:
929     llvm_unreachable("Wrong trig opcode");
930   }
931   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
932       DAG.getNode(ISD::FADD, DL, VT, FractPart,
933         DAG.getConstantFP(-0.5, DL, MVT::f32)));
934   if (Gen >= AMDGPUSubtarget::R700)
935     return TrigVal;
936   // On R600 hw, COS/SIN input must be between -Pi and Pi.
937   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
938       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
939 }
940 
941 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
942   SDLoc DL(Op);
943   EVT VT = Op.getValueType();
944 
945   SDValue Lo = Op.getOperand(0);
946   SDValue Hi = Op.getOperand(1);
947   SDValue Shift = Op.getOperand(2);
948   SDValue Zero = DAG.getConstant(0, DL, VT);
949   SDValue One  = DAG.getConstant(1, DL, VT);
950 
951   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
952   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
953   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
954   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
955 
956   // The dance around Width1 is necessary for 0 special case.
957   // Without it the CompShift might be 32, producing incorrect results in
958   // Overflow. So we do the shift in two steps, the alternative is to
959   // add a conditional to filter the special case.
960 
961   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
962   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
963 
964   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
965   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
966   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
967 
968   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
969   SDValue LoBig = Zero;
970 
971   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
972   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
973 
974   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
975 }
976 
977 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
978   SDLoc DL(Op);
979   EVT VT = Op.getValueType();
980 
981   SDValue Lo = Op.getOperand(0);
982   SDValue Hi = Op.getOperand(1);
983   SDValue Shift = Op.getOperand(2);
984   SDValue Zero = DAG.getConstant(0, DL, VT);
985   SDValue One  = DAG.getConstant(1, DL, VT);
986 
987   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
988 
989   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
990   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
991   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
992   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
993 
994   // The dance around Width1 is necessary for 0 special case.
995   // Without it the CompShift might be 32, producing incorrect results in
996   // Overflow. So we do the shift in two steps, the alternative is to
997   // add a conditional to filter the special case.
998 
999   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1000   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1001 
1002   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1003   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1004   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1005 
1006   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1007   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1008 
1009   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1010   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1011 
1012   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1013 }
1014 
1015 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1016                                           unsigned mainop, unsigned ovf) const {
1017   SDLoc DL(Op);
1018   EVT VT = Op.getValueType();
1019 
1020   SDValue Lo = Op.getOperand(0);
1021   SDValue Hi = Op.getOperand(1);
1022 
1023   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1024   // Extend sign.
1025   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1026                     DAG.getValueType(MVT::i1));
1027 
1028   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1029 
1030   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1031 }
1032 
1033 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1034   SDLoc DL(Op);
1035   return DAG.getNode(
1036       ISD::SETCC,
1037       DL,
1038       MVT::i1,
1039       Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
1040       DAG.getCondCode(ISD::SETNE)
1041       );
1042 }
1043 
1044 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1045                                                    SDLoc DL,
1046                                                    unsigned DwordOffset) const {
1047   unsigned ByteOffset = DwordOffset * 4;
1048   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1049                                       AMDGPUAS::CONSTANT_BUFFER_0);
1050 
1051   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1052   assert(isInt<16>(ByteOffset));
1053 
1054   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1055                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1056                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1057                      false, false, false, 0);
1058 }
1059 
1060 bool R600TargetLowering::isZero(SDValue Op) const {
1061   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1062     return Cst->isNullValue();
1063   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1064     return CstFP->isZero();
1065   } else {
1066     return false;
1067   }
1068 }
1069 
1070 bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
1071   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1072     return CFP->isExactlyValue(1.0);
1073   }
1074   return isAllOnesConstant(Op);
1075 }
1076 
1077 bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
1078   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1079     return CFP->getValueAPF().isZero();
1080   }
1081   return isNullConstant(Op);
1082 }
1083 
// Lower SELECT_CC into a form the R600 SET* / CND* instructions can match,
// falling back to a pair of selects when no native form applies.
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // f32 selects may instead fold to a legacy fmin/fmax combine.
  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  // True/False are reversed relative to what SET* wants: invert the
  // condition, or invert-and-swap the compare operands, so they land right.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    // Not-equal conditions are handled as the inverted condition with the
    // True/False operands exchanged.
    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  }
  else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}
1225 
1226 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1227 /// convert these pointers to a register index.  Each register holds
1228 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1229 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1230 /// for indirect addressing.
1231 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1232                                                unsigned StackWidth,
1233                                                SelectionDAG &DAG) const {
1234   unsigned SRLPad;
1235   switch(StackWidth) {
1236   case 1:
1237     SRLPad = 2;
1238     break;
1239   case 2:
1240     SRLPad = 3;
1241     break;
1242   case 4:
1243     SRLPad = 4;
1244     break;
1245   default: llvm_unreachable("Invalid stack width");
1246   }
1247 
1248   SDLoc DL(Ptr);
1249   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1250                      DAG.getConstant(SRLPad, DL, MVT::i32));
1251 }
1252 
1253 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1254                                          unsigned ElemIdx,
1255                                          unsigned &Channel,
1256                                          unsigned &PtrIncr) const {
1257   switch (StackWidth) {
1258   default:
1259   case 1:
1260     Channel = 0;
1261     if (ElemIdx > 0) {
1262       PtrIncr = 1;
1263     } else {
1264       PtrIncr = 0;
1265     }
1266     break;
1267   case 2:
1268     Channel = ElemIdx % 2;
1269     if (ElemIdx == 2) {
1270       PtrIncr = 1;
1271     } else {
1272       PtrIncr = 0;
1273     }
1274     break;
1275   case 4:
1276     Channel = ElemIdx;
1277     PtrIncr = 0;
1278     break;
1279   }
1280 }
1281 
1282 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
1283                                                    SelectionDAG &DAG) const {
1284   SDLoc DL(Store);
1285 
1286   unsigned Mask = 0;
1287   if (Store->getMemoryVT() == MVT::i8) {
1288     Mask = 0xff;
1289   } else if (Store->getMemoryVT() == MVT::i16) {
1290     Mask = 0xffff;
1291   }
1292 
1293   SDValue Chain = Store->getChain();
1294   SDValue BasePtr = Store->getBasePtr();
1295   EVT MemVT = Store->getMemoryVT();
1296 
1297   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
1298                             DAG.getConstant(2, DL, MVT::i32));
1299   SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
1300                             Chain, Ptr,
1301                             DAG.getTargetConstant(0, DL, MVT::i32));
1302 
1303   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
1304                                 DAG.getConstant(0x3, DL, MVT::i32));
1305 
1306   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1307                                  DAG.getConstant(3, DL, MVT::i32));
1308 
1309   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
1310                                   Store->getValue());
1311 
1312   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
1313 
1314   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
1315                                      MaskedValue, ShiftAmt);
1316 
1317   SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
1318                                 DAG.getConstant(Mask, DL, MVT::i32),
1319                                 ShiftAmt);
1320   DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
1321                         DAG.getConstant(0xffffffff, DL, MVT::i32));
1322   Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
1323 
1324   SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
1325   return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1326                      Chain, Value, Ptr,
1327                      DAG.getTargetConstant(0, DL, MVT::i32));
1328 }
1329 
// Custom store lowering: masked-or (MSKOR) byte/short stores and
// dword-addressed stores for global memory, and REGISTER_STORE based
// lowering for private memory.
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
    return Result;

  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  unsigned AS = StoreNode->getAddressSpace();
  SDValue Value = StoreNode->getValue();
  EVT ValueVT = Value.getValueType();

  // Vector stores to local/private memory are split into scalar stores.
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
      ValueVT.isVector()) {
    return SplitVectorStore(Op, DAG);
  }

  SDLoc DL(Op);
  SDValue Chain = StoreNode->getChain();
  SDValue Ptr = StoreNode->getBasePtr();

  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      // i8/i16 global store: build a STORE_MSKOR with the value shifted
      // into byte position (lane 0) and the write mask in lane 3.
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, DL, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, DL, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      // Bit offset of the target byte within its dword (byte index * 8).
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                   DAG.getConstant(3, DL, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               ValueVT.bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  // Everything from here on handles the private address space only.
  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
    return SDValue();

  EVT MemVT = StoreNode->getMemoryVT();
  // Sub-dword private stores need a read-modify-write sequence.
  if (MemVT.bitsLT(MVT::i32))
    return lowerPrivateTruncStore(StoreNode, DAG);

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    // Store each element to the channel/register chosen by getStackAddress.
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, DL, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, DL, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, DL, MVT::i32));
    }
     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
   } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
    DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
  }

  return Chain;
}
1444 
// Returns 512 + (kc_bank << 12) for a CONSTANT_BUFFER_* address space,
// or -1 for any other address space.
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    // Not a constant-buffer address space.
    return -1;
  }
}
1485 
1486 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
1487                                                 SelectionDAG &DAG) const {
1488   SDLoc DL(Op);
1489   LoadSDNode *Load = cast<LoadSDNode>(Op);
1490   ISD::LoadExtType ExtType = Load->getExtensionType();
1491   EVT MemVT = Load->getMemoryVT();
1492 
1493   // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
1494   // register (2-)byte extract.
1495 
1496   // Get Register holding the target.
1497   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
1498                             DAG.getConstant(2, DL, MVT::i32));
1499   // Load the Register.
1500   SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
1501                             Load->getChain(),
1502                             Ptr,
1503                             DAG.getTargetConstant(0, DL, MVT::i32),
1504                             Op.getOperand(2));
1505 
1506   // Get offset within the register.
1507   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
1508                                 Load->getBasePtr(),
1509                                 DAG.getConstant(0x3, DL, MVT::i32));
1510 
1511   // Bit offset of target byte (byteIdx * 8).
1512   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1513                                  DAG.getConstant(3, DL, MVT::i32));
1514 
1515   // Shift to the right.
1516   Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
1517 
1518   // Eliminate the upper bits by setting them to ...
1519   EVT MemEltVT = MemVT.getScalarType();
1520 
1521   // ... ones.
1522   if (ExtType == ISD::SEXTLOAD) {
1523     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
1524 
1525     SDValue Ops[] = {
1526       DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
1527       Load->getChain()
1528     };
1529 
1530     return DAG.getMergeValues(Ops, DL);
1531   }
1532 
1533   // ... or zeros.
1534   SDValue Ops[] = {
1535     DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
1536     Load->getChain()
1537   };
1538 
1539   return DAG.getMergeValues(Ops, DL);
1540 }
1541 
1542 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1543   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1544   unsigned AS = LoadNode->getAddressSpace();
1545   EVT MemVT = LoadNode->getMemoryVT();
1546   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
1547 
1548   if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
1549       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
1550     return lowerPrivateExtLoad(Op, DAG);
1551   }
1552 
1553   SDLoc DL(Op);
1554   EVT VT = Op.getValueType();
1555   SDValue Chain = LoadNode->getChain();
1556   SDValue Ptr = LoadNode->getBasePtr();
1557 
1558   // Lower loads constant address space global variable loads
1559   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1560       isa<GlobalVariable>(GetUnderlyingObject(
1561           LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {
1562 
1563     SDValue Ptr = DAG.getZExtOrTrunc(
1564         LoadNode->getBasePtr(), DL,
1565         getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
1566     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1567         DAG.getConstant(2, DL, MVT::i32));
1568     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1569                        LoadNode->getChain(), Ptr,
1570                        DAG.getTargetConstant(0, DL, MVT::i32),
1571                        Op.getOperand(2));
1572   }
1573 
1574   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1575     SDValue MergedValues[2] = {
1576       scalarizeVectorLoad(LoadNode, DAG),
1577       Chain
1578     };
1579     return DAG.getMergeValues(MergedValues, DL);
1580   }
1581 
1582   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1583   if (ConstantBlock > -1 &&
1584       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1585        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1586     SDValue Result;
1587     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1588         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1589         isa<ConstantSDNode>(Ptr)) {
1590       SDValue Slots[4];
1591       for (unsigned i = 0; i < 4; i++) {
1592         // We want Const position encoded with the following formula :
1593         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1594         // const_index is Ptr computed by llvm using an alignment of 16.
1595         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1596         // then div by 4 at the ISel step
1597         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1598             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1599         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1600       }
1601       EVT NewVT = MVT::v4i32;
1602       unsigned NumElements = 4;
1603       if (VT.isVector()) {
1604         NewVT = VT;
1605         NumElements = VT.getVectorNumElements();
1606       }
1607       Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
1608     } else {
1609       // non-constant ptr can't be folded, keeps it as a v4f32 load
1610       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1611           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1612                       DAG.getConstant(4, DL, MVT::i32)),
1613                       DAG.getConstant(LoadNode->getAddressSpace() -
1614                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1615           );
1616     }
1617 
1618     if (!VT.isVector()) {
1619       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1620                            DAG.getConstant(0, DL, MVT::i32));
1621     }
1622 
1623     SDValue MergedValues[2] = {
1624       Result,
1625       Chain
1626     };
1627     return DAG.getMergeValues(MergedValues, DL);
1628   }
1629 
1630   SDValue LoweredLoad;
1631 
1632   // For most operations returning SDValue() will result in the node being
1633   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1634   // need to manually expand loads that may be legal in some address spaces and
1635   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1636   // compute shaders, since the data is sign extended when it is uploaded to the
1637   // buffer. However SEXT loads from other address spaces are not supported, so
1638   // we need to expand them here.
1639   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1640     EVT MemVT = LoadNode->getMemoryVT();
1641     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1642     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1643                                   LoadNode->getPointerInfo(), MemVT,
1644                                   LoadNode->isVolatile(),
1645                                   LoadNode->isNonTemporal(),
1646                                   LoadNode->isInvariant(),
1647                                   LoadNode->getAlignment());
1648     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1649                               DAG.getValueType(MemVT));
1650 
1651     SDValue MergedValues[2] = { Res, Chain };
1652     return DAG.getMergeValues(MergedValues, DL);
1653   }
1654 
1655   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1656     return SDValue();
1657   }
1658 
1659   // Lowering for indirect addressing
1660   const MachineFunction &MF = DAG.getMachineFunction();
1661   const AMDGPUFrameLowering *TFL =
1662       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1663   unsigned StackWidth = TFL->getStackWidth(MF);
1664 
1665   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1666 
1667   if (VT.isVector()) {
1668     unsigned NumElemVT = VT.getVectorNumElements();
1669     EVT ElemVT = VT.getVectorElementType();
1670     SDValue Loads[4];
1671 
1672     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1673                                       "vector width in load");
1674 
1675     for (unsigned i = 0; i < NumElemVT; ++i) {
1676       unsigned Channel, PtrIncr;
1677       getStackAddress(StackWidth, i, Channel, PtrIncr);
1678       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1679                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1680       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1681                              Chain, Ptr,
1682                              DAG.getTargetConstant(Channel, DL, MVT::i32),
1683                              Op.getOperand(2));
1684     }
1685     for (unsigned i = NumElemVT; i < 4; ++i) {
1686       Loads[i] = DAG.getUNDEF(ElemVT);
1687     }
1688     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1689     LoweredLoad = DAG.getBuildVector(TargetVT, DL, Loads);
1690   } else {
1691     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1692                               Chain, Ptr,
1693                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1694                               Op.getOperand(2));
1695   }
1696 
1697   SDValue Ops[2] = {
1698     LoweredLoad,
1699     Chain
1700   };
1701 
1702   return DAG.getMergeValues(Ops, DL);
1703 }
1704 
1705 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1706   SDValue Chain = Op.getOperand(0);
1707   SDValue Cond  = Op.getOperand(1);
1708   SDValue Jump  = Op.getOperand(2);
1709 
1710   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1711                      Chain, Jump, Cond);
1712 }
1713 
1714 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
1715                                             SelectionDAG &DAG) const {
1716   MachineFunction &MF = DAG.getMachineFunction();
1717   const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();
1718 
1719   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
1720 
1721   unsigned FrameIndex = FIN->getIndex();
1722   unsigned IgnoredFrameReg;
1723   unsigned Offset =
1724     TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
1725   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
1726                          Op.getValueType());
1727 }
1728 
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
///
/// Kernel arguments are materialized as loads from the constant buffer
/// (CONSTANT_BUFFER_0); shader-convention arguments are passed in
/// R600_Reg128 registers instead.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  SmallVector<ISD::InputArg, 8> LocalIns;

  // Analyze the pre-legalization (original IR) argument types so that
  // ArgLocs describes memory layout in terms of the source types, while
  // Ins still carries the legalized register types.
  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    const ISD::InputArg &In = Ins[i];
    EVT VT = In.VT;
    EVT MemVT = VA.getLocVT();
    if (!VT.isVector() && MemVT.isVector()) {
      // Get load source type if scalarized.
      MemVT = MemVT.getVectorElementType();
    }

    if (AMDGPU::isShader(CallConv)) {
      // Shader arguments arrive live-in in 128-bit registers; no
      // constant-buffer load is needed.
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. It attempts to create this sextload, but it ends up
    // being invalid. Somehow this seems to work with i64 arguments, but breaks
    // for <1 x i64>.

    // The first 36 bytes of the input buffer contains information about
    // thread group and global sizes.
    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
      // FIXME: This should really check the extload type, but the handling of
      // extload vector parameters seems to be broken.

      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      Ext = ISD::SEXTLOAD;
    }

    // Compute the offset from the value.
    // XXX - I think PartOffset should give you this, but it seems to give the
    // size of the register which isn't useful.

    // ValBase is the buffer offset of the first piece of this original IR
    // argument; PartOffset - ValBase is this piece's offset within it.
    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
    unsigned PartOffset = VA.getLocMemOffset();
    // 36-byte skip for the dispatch-info header described above.
    unsigned Offset = 36 + VA.getLocMemOffset();

    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
                              DAG.getConstant(Offset, DL, MVT::i32),
                              DAG.getUNDEF(MVT::i32),
                              PtrInfo,
                              MemVT, false, true, true, 4);

    // 4 is the preferred alignment for the CONSTANT memory space.
    InVals.push_back(Arg);
    // Record where the next argument would start so intrinsics that append
    // data after the ABI arguments know the running offset.
    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
  }
  return Chain;
}
1808 
1809 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1810                                            EVT VT) const {
1811    if (!VT.isVector())
1812      return MVT::i32;
1813    return VT.changeVectorElementTypeToInteger();
1814 }
1815 
1816 bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1817                                                         unsigned AddrSpace,
1818                                                         unsigned Align,
1819                                                         bool *IsFast) const {
1820   if (IsFast)
1821     *IsFast = false;
1822 
1823   if (!VT.isSimple() || VT == MVT::Other)
1824     return false;
1825 
1826   if (VT.bitsLT(MVT::i32))
1827     return false;
1828 
1829   // TODO: This is a rough estimate.
1830   if (IsFast)
1831     *IsFast = true;
1832 
1833   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1834 }
1835 
1836 static SDValue CompactSwizzlableVector(
1837   SelectionDAG &DAG, SDValue VectorEntry,
1838   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1839   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1840   assert(RemapSwizzle.empty());
1841   SDValue NewBldVec[4] = {
1842     VectorEntry.getOperand(0),
1843     VectorEntry.getOperand(1),
1844     VectorEntry.getOperand(2),
1845     VectorEntry.getOperand(3)
1846   };
1847 
1848   for (unsigned i = 0; i < 4; i++) {
1849     if (NewBldVec[i].isUndef())
1850       // We mask write here to teach later passes that the ith element of this
1851       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1852       // break false dependencies and additionnaly make assembly easier to read.
1853       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1854     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1855       if (C->isZero()) {
1856         RemapSwizzle[i] = 4; // SEL_0
1857         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1858       } else if (C->isExactlyValue(1.0)) {
1859         RemapSwizzle[i] = 5; // SEL_1
1860         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1861       }
1862     }
1863 
1864     if (NewBldVec[i].isUndef())
1865       continue;
1866     for (unsigned j = 0; j < i; j++) {
1867       if (NewBldVec[i] == NewBldVec[j]) {
1868         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1869         RemapSwizzle[i] = j;
1870         break;
1871       }
1872     }
1873   }
1874 
1875   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1876                             NewBldVec);
1877 }
1878 
1879 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1880                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1881   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1882   assert(RemapSwizzle.empty());
1883   SDValue NewBldVec[4] = {
1884       VectorEntry.getOperand(0),
1885       VectorEntry.getOperand(1),
1886       VectorEntry.getOperand(2),
1887       VectorEntry.getOperand(3)
1888   };
1889   bool isUnmovable[4] = { false, false, false, false };
1890   for (unsigned i = 0; i < 4; i++) {
1891     RemapSwizzle[i] = i;
1892     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1893       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1894           ->getZExtValue();
1895       if (i == Idx)
1896         isUnmovable[Idx] = true;
1897     }
1898   }
1899 
1900   for (unsigned i = 0; i < 4; i++) {
1901     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1902       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1903           ->getZExtValue();
1904       if (isUnmovable[Idx])
1905         continue;
1906       // Swap i and Idx
1907       std::swap(NewBldVec[Idx], NewBldVec[i]);
1908       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1909       break;
1910     }
1911   }
1912 
1913   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1914                             NewBldVec);
1915 }
1916 
1917 
1918 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1919                                             SDValue Swz[4], SelectionDAG &DAG,
1920                                             SDLoc DL) const {
1921   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1922   // Old -> New swizzle values
1923   DenseMap<unsigned, unsigned> SwizzleRemap;
1924 
1925   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1926   for (unsigned i = 0; i < 4; i++) {
1927     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1928     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1929       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1930   }
1931 
1932   SwizzleRemap.clear();
1933   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1934   for (unsigned i = 0; i < 4; i++) {
1935     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1936     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1937       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1938   }
1939 
1940   return BuildVector;
1941 }
1942 
1943 
1944 //===----------------------------------------------------------------------===//
1945 // Custom DAG Optimizations
1946 //===----------------------------------------------------------------------===//
1947 
1948 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1949                                               DAGCombinerInfo &DCI) const {
1950   SelectionDAG &DAG = DCI.DAG;
1951 
1952   switch (N->getOpcode()) {
1953   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1954   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1955   case ISD::FP_ROUND: {
1956       SDValue Arg = N->getOperand(0);
1957       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1958         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1959                            Arg.getOperand(0));
1960       }
1961       break;
1962     }
1963 
1964   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1965   // (i32 select_cc f32, f32, -1, 0 cc)
1966   //
1967   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1968   // this to one of the SET*_DX10 instructions.
1969   case ISD::FP_TO_SINT: {
1970     SDValue FNeg = N->getOperand(0);
1971     if (FNeg.getOpcode() != ISD::FNEG) {
1972       return SDValue();
1973     }
1974     SDValue SelectCC = FNeg.getOperand(0);
1975     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1976         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1977         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1978         !isHWTrueValue(SelectCC.getOperand(2)) ||
1979         !isHWFalseValue(SelectCC.getOperand(3))) {
1980       return SDValue();
1981     }
1982 
1983     SDLoc dl(N);
1984     return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1985                            SelectCC.getOperand(0), // LHS
1986                            SelectCC.getOperand(1), // RHS
1987                            DAG.getConstant(-1, dl, MVT::i32), // True
1988                            DAG.getConstant(0, dl, MVT::i32),  // False
1989                            SelectCC.getOperand(4)); // CC
1990 
1991     break;
1992   }
1993 
1994   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1995   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1996   case ISD::INSERT_VECTOR_ELT: {
1997     SDValue InVec = N->getOperand(0);
1998     SDValue InVal = N->getOperand(1);
1999     SDValue EltNo = N->getOperand(2);
2000     SDLoc dl(N);
2001 
2002     // If the inserted element is an UNDEF, just use the input vector.
2003     if (InVal.isUndef())
2004       return InVec;
2005 
2006     EVT VT = InVec.getValueType();
2007 
2008     // If we can't generate a legal BUILD_VECTOR, exit
2009     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
2010       return SDValue();
2011 
2012     // Check that we know which element is being inserted
2013     if (!isa<ConstantSDNode>(EltNo))
2014       return SDValue();
2015     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
2016 
2017     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
2018     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
2019     // vector elements.
2020     SmallVector<SDValue, 8> Ops;
2021     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
2022       Ops.append(InVec.getNode()->op_begin(),
2023                  InVec.getNode()->op_end());
2024     } else if (InVec.isUndef()) {
2025       unsigned NElts = VT.getVectorNumElements();
2026       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
2027     } else {
2028       return SDValue();
2029     }
2030 
2031     // Insert the element
2032     if (Elt < Ops.size()) {
2033       // All the operands of BUILD_VECTOR must have the same type;
2034       // we enforce that here.
2035       EVT OpVT = Ops[0].getValueType();
2036       if (InVal.getValueType() != OpVT)
2037         InVal = OpVT.bitsGT(InVal.getValueType()) ?
2038           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
2039           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
2040       Ops[Elt] = InVal;
2041     }
2042 
2043     // Return the new vector
2044     return DAG.getBuildVector(VT, dl, Ops);
2045   }
2046 
2047   // Extract_vec (Build_vector) generated by custom lowering
2048   // also needs to be customly combined
2049   case ISD::EXTRACT_VECTOR_ELT: {
2050     SDValue Arg = N->getOperand(0);
2051     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
2052       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2053         unsigned Element = Const->getZExtValue();
2054         return Arg->getOperand(Element);
2055       }
2056     }
2057     if (Arg.getOpcode() == ISD::BITCAST &&
2058         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
2059       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2060         unsigned Element = Const->getZExtValue();
2061         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
2062             Arg->getOperand(0).getOperand(Element));
2063       }
2064     }
2065     break;
2066   }
2067 
2068   case ISD::SELECT_CC: {
2069     // Try common optimizations
2070     if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
2071       return Ret;
2072 
2073     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
2074     //      selectcc x, y, a, b, inv(cc)
2075     //
2076     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
2077     //      selectcc x, y, a, b, cc
2078     SDValue LHS = N->getOperand(0);
2079     if (LHS.getOpcode() != ISD::SELECT_CC) {
2080       return SDValue();
2081     }
2082 
2083     SDValue RHS = N->getOperand(1);
2084     SDValue True = N->getOperand(2);
2085     SDValue False = N->getOperand(3);
2086     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2087 
2088     if (LHS.getOperand(2).getNode() != True.getNode() ||
2089         LHS.getOperand(3).getNode() != False.getNode() ||
2090         RHS.getNode() != False.getNode()) {
2091       return SDValue();
2092     }
2093 
2094     switch (NCC) {
2095     default: return SDValue();
2096     case ISD::SETNE: return LHS;
2097     case ISD::SETEQ: {
2098       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2099       LHSCC = ISD::getSetCCInverse(LHSCC,
2100                                   LHS.getOperand(0).getValueType().isInteger());
2101       if (DCI.isBeforeLegalizeOps() ||
2102           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2103         return DAG.getSelectCC(SDLoc(N),
2104                                LHS.getOperand(0),
2105                                LHS.getOperand(1),
2106                                LHS.getOperand(2),
2107                                LHS.getOperand(3),
2108                                LHSCC);
2109       break;
2110     }
2111     }
2112     return SDValue();
2113   }
2114 
2115   case AMDGPUISD::EXPORT: {
2116     SDValue Arg = N->getOperand(1);
2117     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2118       break;
2119 
2120     SDValue NewArgs[8] = {
2121       N->getOperand(0), // Chain
2122       SDValue(),
2123       N->getOperand(2), // ArrayBase
2124       N->getOperand(3), // Type
2125       N->getOperand(4), // SWZ_X
2126       N->getOperand(5), // SWZ_Y
2127       N->getOperand(6), // SWZ_Z
2128       N->getOperand(7) // SWZ_W
2129     };
2130     SDLoc DL(N);
2131     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2132     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2133   }
2134   case AMDGPUISD::TEXTURE_FETCH: {
2135     SDValue Arg = N->getOperand(1);
2136     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2137       break;
2138 
2139     SDValue NewArgs[19] = {
2140       N->getOperand(0),
2141       N->getOperand(1),
2142       N->getOperand(2),
2143       N->getOperand(3),
2144       N->getOperand(4),
2145       N->getOperand(5),
2146       N->getOperand(6),
2147       N->getOperand(7),
2148       N->getOperand(8),
2149       N->getOperand(9),
2150       N->getOperand(10),
2151       N->getOperand(11),
2152       N->getOperand(12),
2153       N->getOperand(13),
2154       N->getOperand(14),
2155       N->getOperand(15),
2156       N->getOperand(16),
2157       N->getOperand(17),
2158       N->getOperand(18),
2159     };
2160     SDLoc DL(N);
2161     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2162     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2163   }
2164   }
2165 
2166   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2167 }
2168 
2169 static bool
2170 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2171             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2172   const R600InstrInfo *TII =
2173       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2174   if (!Src.isMachineOpcode())
2175     return false;
2176   switch (Src.getMachineOpcode()) {
2177   case AMDGPU::FNEG_R600:
2178     if (!Neg.getNode())
2179       return false;
2180     Src = Src.getOperand(0);
2181     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2182     return true;
2183   case AMDGPU::FABS_R600:
2184     if (!Abs.getNode())
2185       return false;
2186     Src = Src.getOperand(0);
2187     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2188     return true;
2189   case AMDGPU::CONST_COPY: {
2190     unsigned Opcode = ParentNode->getMachineOpcode();
2191     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2192 
2193     if (!Sel.getNode())
2194       return false;
2195 
2196     SDValue CstOffset = Src.getOperand(0);
2197     if (ParentNode->getValueType(0).isVector())
2198       return false;
2199 
2200     // Gather constants values
2201     int SrcIndices[] = {
2202       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2203       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2204       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2211       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2212       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2213     };
2214     std::vector<unsigned> Consts;
2215     for (int OtherSrcIdx : SrcIndices) {
2216       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2217       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2218         continue;
2219       if (HasDst) {
2220         OtherSrcIdx--;
2221         OtherSelIdx--;
2222       }
2223       if (RegisterSDNode *Reg =
2224           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2225         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2226           ConstantSDNode *Cst
2227             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2228           Consts.push_back(Cst->getZExtValue());
2229         }
2230       }
2231     }
2232 
2233     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2234     Consts.push_back(Cst->getZExtValue());
2235     if (!TII->fitsConstReadLimitations(Consts)) {
2236       return false;
2237     }
2238 
2239     Sel = CstOffset;
2240     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2241     return true;
2242   }
2243   case AMDGPU::MOV_IMM_I32:
2244   case AMDGPU::MOV_IMM_F32: {
2245     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2246     uint64_t ImmValue = 0;
2247 
2248 
2249     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2250       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2251       float FloatValue = FPC->getValueAPF().convertToFloat();
2252       if (FloatValue == 0.0) {
2253         ImmReg = AMDGPU::ZERO;
2254       } else if (FloatValue == 0.5) {
2255         ImmReg = AMDGPU::HALF;
2256       } else if (FloatValue == 1.0) {
2257         ImmReg = AMDGPU::ONE;
2258       } else {
2259         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2260       }
2261     } else {
2262       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2263       uint64_t Value = C->getZExtValue();
2264       if (Value == 0) {
2265         ImmReg = AMDGPU::ZERO;
2266       } else if (Value == 1) {
2267         ImmReg = AMDGPU::ONE_INT;
2268       } else {
2269         ImmValue = Value;
2270       }
2271     }
2272 
2273     // Check that we aren't already using an immediate.
2274     // XXX: It's possible for an instruction to have more than one
2275     // immediate operand, but this is not supported yet.
2276     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2277       if (!Imm.getNode())
2278         return false;
2279       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2280       assert(C);
2281       if (C->getZExtValue())
2282         return false;
2283       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2284     }
2285     Src = DAG.getRegister(ImmReg, MVT::i32);
2286     return true;
2287   }
2288   default:
2289     return false;
2290   }
2291 }
2292 
2293 
/// \brief Fold the instructions after selecting them.
///
/// Walks the operands of the selected machine node and tries to fold
/// source modifiers (neg/abs), constant-buffer reads and immediates into
/// the instruction itself via FoldOperand. CLAMP_R600 pseudo nodes are
/// folded into their source instruction's clamp modifier.
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  // Passed to FoldOperand for modifier slots the instruction doesn't have.
  SDValue FakeOp;

  // Mutable copy of the operands; FoldOperand rewrites entries in place.
  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());

  if (Opcode == AMDGPU::DOT_4) {
    // DOT_4 has eight sources (src0/src1 x XYZW), each with its own
    // neg/abs modifier operand.
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
        };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      // MI operand indices include dst; SDNode operands don't, hence "- 1".
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      // FakeOp as Imm: literal folding is not supported for DOT_4.
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    // REG_SEQUENCE operands alternate (reg, subreg-idx) starting at 1;
    // only the register operands can be folded.
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    // Fold CLAMP into the defining instruction's clamp modifier when it
    // supports instruction modifiers.
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
        AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    SDLoc DL(Node);
    // Deliberately shadows the outer Ops: we rebuild the *source* node
    // with its clamp bit set, not the CLAMP node itself.
    std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), DL,
                              Node->getVTList(), Ops);
  } else {
    // Generic ALU instruction: up to three sources with neg/abs modifiers
    // (src2 has no abs) and a single shared literal slot.
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}
2412