lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
  34                                        const AMDGPUSubtarget &STI)
  35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
       // Constructor: registers the register class used for each value type and
       // then declares, per opcode and value type, how the operation is treated
       // (Legal, Expand, Promote or Custom), which node kinds get a target DAG
       // combine, and the scheduling preference.
       // Several of the opcodes marked Custom below (e.g. LOAD, STORE,
       // SELECT_CC, BRCOND, FCOS/FSIN, UADDO/USUBO, *_PARTS shifts) are
       // dispatched in R600TargetLowering::LowerOperation.

       // 32-bit scalars use R600_Reg32, 2-element vectors R600_Reg64 and
       // 4-element vectors R600_Reg128.
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42
       // Called once all register classes above have been added.
  43   computeRegisterProperties(STI.getRegisterInfo());
  44
  45   // Set condition code actions
       // The f32 condition codes listed here are expanded.
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
       // For i32 only the "less than" / "less or equal" forms (signed and
       // unsigned) are expanded.
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63
       // Trigonometric ops are custom lowered (LowerTrig in LowerOperation).
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
       // Branches: BR_CC is expanded, BRCOND is custom lowered.
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
       // Intrinsics are inspected in LowerOperation.
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
       // SELECT_CC is custom lowered, while plain SETCC and SELECT (below) are
       // expanded.
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // ADD, SUB overflow.
  95   // TODO: turn these into Legal?
       // Only marked Custom when the subtarget reports CARRY / BORROW support.
  96   if (Subtarget->hasCARRY())
  97     setOperationAction(ISD::UADDO, MVT::i32, Custom);
  98
  99   if (Subtarget->hasBORROW())
 100     setOperationAction(ISD::USUBO, MVT::i32, Custom);
 101
 102   // Expand sign extension of vectors
       // The scalar i1/i8/i16 forms are expanded only when the subtarget lacks
       // BFE; the vector forms are always expanded.
 103   if (!Subtarget->hasBFE())
 104     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 105
 106   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
 107   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 108
 109   if (!Subtarget->hasBFE())
 110     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 113
 114   if (!Subtarget->hasBFE())
 115     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 116   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 117   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 118
 119   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 120   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 121   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 122
 123   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 124
 125
 126   // Legalize loads and stores to the private address space.
 127   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 128   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 129   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 130
 131   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 132   // spaces, so it is custom lowered to handle those where it isn't.
       // For every integer result type: extending loads from i1 are promoted,
       // those from i8 and i16 are custom lowered.
 133   for (MVT VT : MVT::integer_valuetypes()) {
 134     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 135     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
 136     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
 137
 138     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 139     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
 140     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
 141
 142     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 143     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
 144     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
 145   }
 146
       // Stores, including i32 -> i8 / i16 truncating stores, are custom lowered.
 147   setOperationAction(ISD::STORE, MVT::i8, Custom);
 148   setOperationAction(ISD::STORE, MVT::i32, Custom);
 149   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 150   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 151   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 152   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 153
       // NOTE(review): LOAD for i32 and v4i32 was already set to Custom a few
       // lines above with the same action; the next two calls look redundant.
 154   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 155   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 156   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 157
       // Vector element extraction / insertion is custom lowered for all four
       // registered vector types.
 158   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 159   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 160   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 161   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 162
 163   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 164   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 165   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 166   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 167
       // Node kinds for which this target requests a target-specific DAG combine.
 168   setTargetDAGCombine(ISD::FP_ROUND);
 169   setTargetDAGCombine(ISD::FP_TO_SINT);
 170   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 171   setTargetDAGCombine(ISD::SELECT_CC);
 172   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 173
 174   // These should be replaced by UDIVREM, but it does not happen automatically
 175   // during Type Legalization
 176   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 177   setOperationAction(ISD::UREM, MVT::i64, Custom);
 178   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 179   setOperationAction(ISD::SREM, MVT::i64, Custom);
 180
 181   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 182   //  to be Legal/Custom in order to avoid library calls.
 183   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 184   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 185   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 186
 187   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 188
       // Carry-using add/sub nodes are expanded for both i32 and i64.
 189   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 190   for (MVT VT : ScalarIntVTs) {
 191     setOperationAction(ISD::ADDC, VT, Expand);
 192     setOperationAction(ISD::SUBC, VT, Expand);
 193     setOperationAction(ISD::ADDE, VT, Expand);
 194     setOperationAction(ISD::SUBE, VT, Expand);
 195   }
 196
 197   setSchedulingPreference(Sched::Source);
 198 }
 199
 200 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 201     MachineInstr * MI, MachineBasicBlock * BB) const {
       // Expands the instruction MI inside BB according to its opcode.
       // Most cases build their replacement instruction(s) at the position of
       // MI, break out of the switch, and let the shared tail at the bottom
       // erase MI. Cases that keep MI (unchanged, or modified in place) return
       // BB directly and skip the erase. Opcodes with no case here that are not
       // LDS_*_RET instructions are forwarded to AMDGPUTargetLowering.
 202   MachineFunction * MF = BB->getParent();
 203   MachineRegisterInfo &MRI = MF->getRegInfo();
       // Insertion point for all BuildMI calls below: the position of MI.
 204   MachineBasicBlock::iterator I = *MI;
 205   const R600InstrInfo *TII =
 206       static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 207
 208   switch (MI->getOpcode()) {
 209   default:
 210     // Replace LDS_*_RET instructions that don't have any uses with the
 211     // equivalent LDS_*_NORET instruction.
 212     if (TII->isLDSRetInstr(MI->getOpcode())) {
 213       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 214       assert(DstIdx != -1);
 215       MachineInstrBuilder NewMI;
 216       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 217       //        LDS_1A2D support and remove this special case.
           // Leave MI untouched when its result register still has uses, or
           // when it is LDS_CMPST_RET (the special case the FIXME refers to).
 218       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
 219            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
 220         return BB;
 221
 222       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 223                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
           // Copy every operand except operand 0 -- presumably the dst operand
           // that the NORET form lacks; TODO confirm DstIdx is always 0.
 224       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 225         NewMI.addOperand(MI->getOperand(i));
 226       }
 227     } else {
 228       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 229     }
 230     break;
       // CLAMP / FABS / FNEG pseudos: each becomes a MOV from operand 1 into
       // operand 0, with the matching MO_FLAG_* added through TII->addFlag.
 231   case AMDGPU::CLAMP_R600: {
 232     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 233                                                    AMDGPU::MOV,
 234                                                    MI->getOperand(0).getReg(),
 235                                                    MI->getOperand(1).getReg());
 236     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 237     break;
 238   }
 239
 240   case AMDGPU::FABS_R600: {
 241     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 242                                                     AMDGPU::MOV,
 243                                                     MI->getOperand(0).getReg(),
 244                                                     MI->getOperand(1).getReg());
 245     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 246     break;
 247   }
 248
 249   case AMDGPU::FNEG_R600: {
 250     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 251                                                     AMDGPU::MOV,
 252                                                     MI->getOperand(0).getReg(),
 253                                                     MI->getOperand(1).getReg());
 254     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 255     break;
 256   }
 257
       // MASK_WRITE emits nothing new: it adds MO_FLAG_MASK to the instruction
       // that defines its (virtual) register operand, and is then erased by the
       // shared tail.
 258   case AMDGPU::MASK_WRITE: {
 259     unsigned maskedRegister = MI->getOperand(0).getReg();
 260     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 261     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 262     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 263     break;
 264   }
 265
       // Immediate moves: the f32 variant passes the raw bit pattern of the
       // floating-point immediate, the i32 variant passes the immediate as is.
 266   case AMDGPU::MOV_IMM_F32:
 267     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 268                      MI->getOperand(1).getFPImm()->getValueAPF()
 269                          .bitcastToAPInt().getZExtValue());
 270     break;
 271   case AMDGPU::MOV_IMM_I32:
 272     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 273                      MI->getOperand(1).getImm());
 274     break;
       // CONST_COPY: MOV from ALU_CONST into operand 0, with the src0_sel
       // immediate set to the value of operand 1.
 275   case AMDGPU::CONST_COPY: {
 276     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 277         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 278     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 279         MI->getOperand(1).getImm());
 280     break;
 281   }
 282
       // RAT writes are re-emitted with the same opcode and operands 0 and 1,
       // plus an extra immediate that is 1 when the next instruction is RETURN.
 283   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 284   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 285   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
         // NOTE(review): std::next(I) is dereferenced without being compared
         // against BB->end(); this relies on MI never being the last
         // instruction of its block -- confirm.
 286     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 287
 288     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 289             .addOperand(MI->getOperand(0))
 290             .addOperand(MI->getOperand(1))
 291             .addImm(EOP); // Set End of program bit
 292     break;
 293   }
 294
       // TXD expands into three instructions: TEX_SET_GRADIENTS_H (fed by
       // operand 3, result in T0), TEX_SET_GRADIENTS_V (fed by operand 2,
       // result in T1) and TEX_SAMPLE_G (operands 0 and 1), which lists T0 and
       // T1 as implicit uses. All three share the same trailing operand list.
 295   case AMDGPU::TXD: {
 296     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 297     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 298     MachineOperand &RID = MI->getOperand(4);
 299     MachineOperand &SID = MI->getOperand(5);
 300     unsigned TextureId = MI->getOperand(6).getImm();
 301     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 302     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 303
         // Per-texture-type adjustment of the source selects and the CT*
         // immediates (presumably coordinate-type flags -- TODO confirm).
         // TextureId values not listed keep the defaults set above.
 304     switch (TextureId) {
 305     case 5: // Rect
 306       CTX = CTY = 0;
 307       break;
 308     case 6: // Shadow1D
 309       SrcW = SrcZ;
 310       break;
 311     case 7: // Shadow2D
 312       SrcW = SrcZ;
 313       break;
 314     case 8: // ShadowRect
 315       CTX = CTY = 0;
 316       SrcW = SrcZ;
 317       break;
 318     case 9: // 1DArray
 319       SrcZ = SrcY;
 320       CTZ = 0;
 321       break;
 322     case 10: // 2DArray
 323       CTZ = 0;
 324       break;
 325     case 11: // Shadow1DArray
 326       SrcZ = SrcY;
 327       CTZ = 0;
 328       break;
 329     case 12: // Shadow2DArray
 330       CTZ = 0;
 331       break;
 332     }
 333     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 334             .addOperand(MI->getOperand(3))
 335             .addImm(SrcX)
 336             .addImm(SrcY)
 337             .addImm(SrcZ)
 338             .addImm(SrcW)
 339             .addImm(0)
 340             .addImm(0)
 341             .addImm(0)
 342             .addImm(0)
 343             .addImm(1)
 344             .addImm(2)
 345             .addImm(3)
 346             .addOperand(RID)
 347             .addOperand(SID)
 348             .addImm(CTX)
 349             .addImm(CTY)
 350             .addImm(CTZ)
 351             .addImm(CTW);
 352     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 353             .addOperand(MI->getOperand(2))
 354             .addImm(SrcX)
 355             .addImm(SrcY)
 356             .addImm(SrcZ)
 357             .addImm(SrcW)
 358             .addImm(0)
 359             .addImm(0)
 360             .addImm(0)
 361             .addImm(0)
 362             .addImm(1)
 363             .addImm(2)
 364             .addImm(3)
 365             .addOperand(RID)
 366             .addOperand(SID)
 367             .addImm(CTX)
 368             .addImm(CTY)
 369             .addImm(CTZ)
 370             .addImm(CTW);
 371     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 372             .addOperand(MI->getOperand(0))
 373             .addOperand(MI->getOperand(1))
 374             .addImm(SrcX)
 375             .addImm(SrcY)
 376             .addImm(SrcZ)
 377             .addImm(SrcW)
 378             .addImm(0)
 379             .addImm(0)
 380             .addImm(0)
 381             .addImm(0)
 382             .addImm(1)
 383             .addImm(2)
 384             .addImm(3)
 385             .addOperand(RID)
 386             .addOperand(SID)
 387             .addImm(CTX)
 388             .addImm(CTY)
 389             .addImm(CTZ)
 390             .addImm(CTW)
 391             .addReg(T0, RegState::Implicit)
 392             .addReg(T1, RegState::Implicit);
 393     break;
 394   }
 395
       // TXD_SHADOW: same expansion as TXD above; the only difference is that
       // the final instruction is TEX_SAMPLE_C_G instead of TEX_SAMPLE_G.
       // NOTE(review): the two cases are otherwise duplicated and are a
       // candidate for a shared helper.
 396   case AMDGPU::TXD_SHADOW: {
 397     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 398     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 399     MachineOperand &RID = MI->getOperand(4);
 400     MachineOperand &SID = MI->getOperand(5);
 401     unsigned TextureId = MI->getOperand(6).getImm();
 402     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 403     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 404
 405     switch (TextureId) {
 406     case 5: // Rect
 407       CTX = CTY = 0;
 408       break;
 409     case 6: // Shadow1D
 410       SrcW = SrcZ;
 411       break;
 412     case 7: // Shadow2D
 413       SrcW = SrcZ;
 414       break;
 415     case 8: // ShadowRect
 416       CTX = CTY = 0;
 417       SrcW = SrcZ;
 418       break;
 419     case 9: // 1DArray
 420       SrcZ = SrcY;
 421       CTZ = 0;
 422       break;
 423     case 10: // 2DArray
 424       CTZ = 0;
 425       break;
 426     case 11: // Shadow1DArray
 427       SrcZ = SrcY;
 428       CTZ = 0;
 429       break;
 430     case 12: // Shadow2DArray
 431       CTZ = 0;
 432       break;
 433     }
 434
 435     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 436             .addOperand(MI->getOperand(3))
 437             .addImm(SrcX)
 438             .addImm(SrcY)
 439             .addImm(SrcZ)
 440             .addImm(SrcW)
 441             .addImm(0)
 442             .addImm(0)
 443             .addImm(0)
 444             .addImm(0)
 445             .addImm(1)
 446             .addImm(2)
 447             .addImm(3)
 448             .addOperand(RID)
 449             .addOperand(SID)
 450             .addImm(CTX)
 451             .addImm(CTY)
 452             .addImm(CTZ)
 453             .addImm(CTW);
 454     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 455             .addOperand(MI->getOperand(2))
 456             .addImm(SrcX)
 457             .addImm(SrcY)
 458             .addImm(SrcZ)
 459             .addImm(SrcW)
 460             .addImm(0)
 461             .addImm(0)
 462             .addImm(0)
 463             .addImm(0)
 464             .addImm(1)
 465             .addImm(2)
 466             .addImm(3)
 467             .addOperand(RID)
 468             .addOperand(SID)
 469             .addImm(CTX)
 470             .addImm(CTY)
 471             .addImm(CTZ)
 472             .addImm(CTW);
 473     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 474             .addOperand(MI->getOperand(0))
 475             .addOperand(MI->getOperand(1))
 476             .addImm(SrcX)
 477             .addImm(SrcY)
 478             .addImm(SrcZ)
 479             .addImm(SrcW)
 480             .addImm(0)
 481             .addImm(0)
 482             .addImm(0)
 483             .addImm(0)
 484             .addImm(1)
 485             .addImm(2)
 486             .addImm(3)
 487             .addOperand(RID)
 488             .addOperand(SID)
 489             .addImm(CTX)
 490             .addImm(CTY)
 491             .addImm(CTZ)
 492             .addImm(CTW)
 493             .addReg(T0, RegState::Implicit)
 494             .addReg(T1, RegState::Implicit);
 495     break;
 496   }
 497
       // Unconditional branch: JUMP to operand 0.
 498   case AMDGPU::BRANCH:
 499       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 500               .addOperand(MI->getOperand(0));
 501       break;
 502
       // Conditional branches: PRED_X defines PREDICATE_BIT from the condition
       // in operand 1 and gets MO_FLAG_PUSH added; JUMP_COND to operand 0 then
       // uses PREDICATE_BIT with the Kill flag. The f32 variant passes
       // OPCODE_IS_NOT_ZERO, the i32 variant OPCODE_IS_NOT_ZERO_INT.
 503   case AMDGPU::BRANCH_COND_f32: {
 504     MachineInstr *NewMI =
 505       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 506               AMDGPU::PREDICATE_BIT)
 507               .addOperand(MI->getOperand(1))
 508               .addImm(OPCODE_IS_NOT_ZERO)
 509               .addImm(0); // Flags
 510     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 511     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 512             .addOperand(MI->getOperand(0))
 513             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 514     break;
 515   }
 516
 517   case AMDGPU::BRANCH_COND_i32: {
 518     MachineInstr *NewMI =
 519       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 520             AMDGPU::PREDICATE_BIT)
 521             .addOperand(MI->getOperand(1))
 522             .addImm(OPCODE_IS_NOT_ZERO_INT)
 523             .addImm(0); // Flags
 524     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 525     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 526            .addOperand(MI->getOperand(0))
 527             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 528     break;
 529   }
 530
 531   case AMDGPU::EG_ExportSwz:
 532   case AMDGPU::R600_ExportSwz: {
 533     // Instruction is left unmodified if it is not the last one of its type
         // (and is not directly followed by RETURN, see the EOP check below).
 534     bool isLastInstructionOfItsType = true;
 535     unsigned InstExportType = MI->getOperand(1).getImm();
         // Scan the rest of the block for a later export (either opcode) whose
         // operand 1 equals this instruction's operand 1.
 536     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 537          EndBlock = BB->end(); NextExportInst != EndBlock;
 538          NextExportInst = std::next(NextExportInst)) {
 539       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 540           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 541         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 542             .getImm();
 543         if (CurrentInstExportType == InstExportType) {
 544           isLastInstructionOfItsType = false;
 545           break;
 546         }
 547       }
 548     }
         // NOTE(review): as in the RAT_WRITE case, std::next(I) is dereferenced
         // without an end-of-block check -- confirm MI can never be last.
 549     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 550     if (!EOP && !isLastInstructionOfItsType)
 551       return BB;
         // 84 is used for EG_ExportSwz and 40 for R600_ExportSwz -- presumably
         // the CF instruction encodings; TODO confirm.
 552     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
         // Re-emit the export with operands 0-6 copied, followed by CfInst and
         // EOP (a bool here, so it is passed to addImm as 0 or 1).
 553     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 554             .addOperand(MI->getOperand(0))
 555             .addOperand(MI->getOperand(1))
 556             .addOperand(MI->getOperand(2))
 557             .addOperand(MI->getOperand(3))
 558             .addOperand(MI->getOperand(4))
 559             .addOperand(MI->getOperand(5))
 560             .addOperand(MI->getOperand(6))
 561             .addImm(CfInst)
 562             .addImm(EOP);
 563     break;
 564   }
 565   case AMDGPU::RETURN: {
 566     // RETURN instructions must have the live-out registers as implicit uses,
 567     // otherwise they appear dead.
         // MI is extended in place with one implicit use per entry of
         // MFI->LiveOuts, so return here instead of reaching the erase below.
 568     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 569     MachineInstrBuilder MIB(*MF, MI);
 570     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 571       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 572     return BB;
 573   }
 574   }
 575
       // Shared tail for every case that left the switch via break: the
       // original instruction is removed from the block.
 576   MI->eraseFromParent();
 577   return BB;
 578 }
 579
 580 //===----------------------------------------------------------------------===//
 581 // Custom DAG Lowering Operations
 582 //===----------------------------------------------------------------------===//
 583
 584 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 585   MachineFunction &MF = DAG.getMachineFunction();
 586   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 587   switch (Op.getOpcode()) {
 588   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 589   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 590   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 591   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 592   case ISD::SRA_PARTS:
 593   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 594   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
 595   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
 596   case ISD::FCOS:
 597   case ISD::FSIN: return LowerTrig(Op, DAG);
 598   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 599   case ISD::STORE: return LowerSTORE(Op, DAG);
 600   case ISD::LOAD: {
 601     SDValue Result = LowerLOAD(Op, DAG);
 602     assert((!Result.getNode() ||
 603             Result.getNode()->getNumValues() == 2) &&
 604            "Load should return a value and a chain");
 605     return Result;
 606   }
 607
 608   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 609   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 610   case ISD::INTRINSIC_VOID: {
 611     SDValue Chain = Op.getOperand(0);
 612     unsigned IntrinsicID =
 613                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 614     switch (IntrinsicID) {
 615     case AMDGPUIntrinsic::AMDGPU_store_output: {
 616       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 617       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 618       MFI->LiveOuts.push_back(Reg);
 619       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 620     }
 621     case AMDGPUIntrinsic::R600_store_swizzle: {
 622       SDLoc DL(Op);
 623       const SDValue Args[8] = {
 624         Chain,
 625         Op.getOperand(2), // Export Value
 626         Op.getOperand(3), // ArrayBase
 627         Op.getOperand(4), // Type
 628         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
 629         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
 630         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
 631         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
 632       };
 633       return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
 634     }
 635
 636     // default for switch(IntrinsicID)
 637     default: break;
 638     }
 639     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 640     break;
 641   }
 642   case ISD::INTRINSIC_WO_CHAIN: {
 643     unsigned IntrinsicID =
 644                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 645     EVT VT = Op.getValueType();
 646     SDLoc DL(Op);
 647     switch(IntrinsicID) {
 648     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 649     case AMDGPUIntrinsic::R600_load_input: {
 650       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 651       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 652       MachineFunction &MF = DAG.getMachineFunction();
 653       MachineRegisterInfo &MRI = MF.getRegInfo();
 654       MRI.addLiveIn(Reg);
 655       return DAG.getCopyFromReg(DAG.getEntryNode(),
 656           SDLoc(DAG.getEntryNode()), Reg, VT);
 657     }
 658
 659     case AMDGPUIntrinsic::R600_interp_input: {
 660       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 661       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 662       MachineSDNode *interp;
 663       if (ijb < 0) {
 664         const R600InstrInfo *TII =
 665             static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 666         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 667             MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32));
 668         return DAG.getTargetExtractSubreg(
 669             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 670             DL, MVT::f32, SDValue(interp, 0));
 671       }
 672       MachineFunction &MF = DAG.getMachineFunction();
 673       MachineRegisterInfo &MRI = MF.getRegInfo();
 674       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 675       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 676       MRI.addLiveIn(RegisterI);
 677       MRI.addLiveIn(RegisterJ);
 678       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 679           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 680       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 681           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 682
 683       if (slot % 4 < 2)
 684         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 685             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
 686             RegisterJNode, RegisterINode);
 687       else
 688         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 689             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
 690             RegisterJNode, RegisterINode);
 691       return SDValue(interp, slot % 2);
 692     }
 693     case AMDGPUIntrinsic::R600_interp_xy:
 694     case AMDGPUIntrinsic::R600_interp_zw: {
 695       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 696       MachineSDNode *interp;
 697       SDValue RegisterINode = Op.getOperand(2);
 698       SDValue RegisterJNode = Op.getOperand(3);
 699
 700       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 701         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 702             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
 703             RegisterJNode, RegisterINode);
 704       else
 705         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 706             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
 707             RegisterJNode, RegisterINode);
 708       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 709           SDValue(interp, 0), SDValue(interp, 1));
 710     }
 711     case AMDGPUIntrinsic::R600_tex:
 712     case AMDGPUIntrinsic::R600_texc:
 713     case AMDGPUIntrinsic::R600_txl:
 714     case AMDGPUIntrinsic::R600_txlc:
 715     case AMDGPUIntrinsic::R600_txb:
 716     case AMDGPUIntrinsic::R600_txbc:
 717     case AMDGPUIntrinsic::R600_txf:
 718     case AMDGPUIntrinsic::R600_txq:
 719     case AMDGPUIntrinsic::R600_ddx:
 720     case AMDGPUIntrinsic::R600_ddy:
 721     case AMDGPUIntrinsic::R600_ldptr: {
 722       unsigned TextureOp;
 723       switch (IntrinsicID) {
 724       case AMDGPUIntrinsic::R600_tex:
 725         TextureOp = 0;
 726         break;
 727       case AMDGPUIntrinsic::R600_texc:
 728         TextureOp = 1;
 729         break;
 730       case AMDGPUIntrinsic::R600_txl:
 731         TextureOp = 2;
 732         break;
 733       case AMDGPUIntrinsic::R600_txlc:
 734         TextureOp = 3;
 735         break;
 736       case AMDGPUIntrinsic::R600_txb:
 737         TextureOp = 4;
 738         break;
 739       case AMDGPUIntrinsic::R600_txbc:
 740         TextureOp = 5;
 741         break;
 742       case AMDGPUIntrinsic::R600_txf:
 743         TextureOp = 6;
 744         break;
 745       case AMDGPUIntrinsic::R600_txq:
 746         TextureOp = 7;
 747         break;
 748       case AMDGPUIntrinsic::R600_ddx:
 749         TextureOp = 8;
 750         break;
 751       case AMDGPUIntrinsic::R600_ddy:
 752         TextureOp = 9;
 753         break;
 754       case AMDGPUIntrinsic::R600_ldptr:
 755         TextureOp = 10;
 756         break;
 757       default:
 758         llvm_unreachable("Unknow Texture Operation");
 759       }
 760
 761       SDValue TexArgs[19] = {
 762         DAG.getConstant(TextureOp, DL, MVT::i32),
 763         Op.getOperand(1),
 764         DAG.getConstant(0, DL, MVT::i32),
 765         DAG.getConstant(1, DL, MVT::i32),
 766         DAG.getConstant(2, DL, MVT::i32),
 767         DAG.getConstant(3, DL, MVT::i32),
 768         Op.getOperand(2),
 769         Op.getOperand(3),
 770         Op.getOperand(4),
 771         DAG.getConstant(0, DL, MVT::i32),
 772         DAG.getConstant(1, DL, MVT::i32),
 773         DAG.getConstant(2, DL, MVT::i32),
 774         DAG.getConstant(3, DL, MVT::i32),
 775         Op.getOperand(5),
 776         Op.getOperand(6),
 777         Op.getOperand(7),
 778         Op.getOperand(8),
 779         Op.getOperand(9),
 780         Op.getOperand(10)
 781       };
 782       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 783     }
 784     case AMDGPUIntrinsic::AMDGPU_dp4: {
 785       SDValue Args[8] = {
 786       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 787           DAG.getConstant(0, DL, MVT::i32)),
 788       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 789           DAG.getConstant(0, DL, MVT::i32)),
 790       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 791           DAG.getConstant(1, DL, MVT::i32)),
 792       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 793           DAG.getConstant(1, DL, MVT::i32)),
 794       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 795           DAG.getConstant(2, DL, MVT::i32)),
 796       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 797           DAG.getConstant(2, DL, MVT::i32)),
 798       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 799           DAG.getConstant(3, DL, MVT::i32)),
 800       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 801           DAG.getConstant(3, DL, MVT::i32))
 802       };
 803       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 804     }
 805
 806     case Intrinsic::r600_read_ngroups_x:
 807       return LowerImplicitParameter(DAG, VT, DL, 0);
 808     case Intrinsic::r600_read_ngroups_y:
 809       return LowerImplicitParameter(DAG, VT, DL, 1);
 810     case Intrinsic::r600_read_ngroups_z:
 811       return LowerImplicitParameter(DAG, VT, DL, 2);
 812     case Intrinsic::r600_read_global_size_x:
 813       return LowerImplicitParameter(DAG, VT, DL, 3);
 814     case Intrinsic::r600_read_global_size_y:
 815       return LowerImplicitParameter(DAG, VT, DL, 4);
 816     case Intrinsic::r600_read_global_size_z:
 817       return LowerImplicitParameter(DAG, VT, DL, 5);
 818     case Intrinsic::r600_read_local_size_x:
 819       return LowerImplicitParameter(DAG, VT, DL, 6);
 820     case Intrinsic::r600_read_local_size_y:
 821       return LowerImplicitParameter(DAG, VT, DL, 7);
 822     case Intrinsic::r600_read_local_size_z:
 823       return LowerImplicitParameter(DAG, VT, DL, 8);
 824
 825     case Intrinsic::AMDGPU_read_workdim:
 826       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
 827
 828     case Intrinsic::r600_read_tgid_x:
 829       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 830                                   AMDGPU::T1_X, VT);
 831     case Intrinsic::r600_read_tgid_y:
 832       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 833                                   AMDGPU::T1_Y, VT);
 834     case Intrinsic::r600_read_tgid_z:
 835       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 836                                   AMDGPU::T1_Z, VT);
 837     case Intrinsic::r600_read_tidig_x:
 838       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 839                                   AMDGPU::T0_X, VT);
 840     case Intrinsic::r600_read_tidig_y:
 841       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 842                                   AMDGPU::T0_Y, VT);
 843     case Intrinsic::r600_read_tidig_z:
 844       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 845                                   AMDGPU::T0_Z, VT);
 846     case Intrinsic::AMDGPU_rsq:
 847       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 848       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 849
 850     case AMDGPUIntrinsic::AMDGPU_fract:
 851     case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
 852       return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
 853     }
 854     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 855     break;
 856   }
 857   } // end switch(Op.getOpcode())
 858   return SDValue();
 859 }
 860
 861 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 862                                             SmallVectorImpl<SDValue> &Results,
 863                                             SelectionDAG &DAG) const {
 864   switch (N->getOpcode()) {
 865   default:
 866     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 867     return;
 868   case ISD::FP_TO_UINT:
 869     if (N->getValueType(0) == MVT::i1) {
 870       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 871       return;
 872     }
 873     // Fall-through. Since we don't care about out of bounds values
 874     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 875     // considers some extra cases which are not necessary here.
 876   case ISD::FP_TO_SINT: {
 877     SDValue Result;
 878     if (expandFP_TO_SINT(N, Result, DAG))
 879       Results.push_back(Result);
 880     return;
 881   }
 882   case ISD::UDIV: {
 883     SDValue Op = SDValue(N, 0);
 884     SDLoc DL(Op);
 885     EVT VT = Op.getValueType();
 886     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 887       N->getOperand(0), N->getOperand(1));
 888     Results.push_back(UDIVREM);
 889     break;
 890   }
 891   case ISD::UREM: {
 892     SDValue Op = SDValue(N, 0);
 893     SDLoc DL(Op);
 894     EVT VT = Op.getValueType();
 895     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 896       N->getOperand(0), N->getOperand(1));
 897     Results.push_back(UDIVREM.getValue(1));
 898     break;
 899   }
 900   case ISD::SDIV: {
 901     SDValue Op = SDValue(N, 0);
 902     SDLoc DL(Op);
 903     EVT VT = Op.getValueType();
 904     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 905       N->getOperand(0), N->getOperand(1));
 906     Results.push_back(SDIVREM);
 907     break;
 908   }
 909   case ISD::SREM: {
 910     SDValue Op = SDValue(N, 0);
 911     SDLoc DL(Op);
 912     EVT VT = Op.getValueType();
 913     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 914       N->getOperand(0), N->getOperand(1));
 915     Results.push_back(SDIVREM.getValue(1));
 916     break;
 917   }
 918   case ISD::SDIVREM: {
 919     SDValue Op = SDValue(N, 1);
 920     SDValue RES = LowerSDIVREM(Op, DAG);
 921     Results.push_back(RES);
 922     Results.push_back(RES.getValue(1));
 923     break;
 924   }
 925   case ISD::UDIVREM: {
 926     SDValue Op = SDValue(N, 0);
 927     LowerUDIVREM64(Op, DAG, Results);
 928     break;
 929   }
 930   }
 931 }
 932
 933 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 934                                                    SDValue Vector) const {
 935
 936   SDLoc DL(Vector);
 937   EVT VecVT = Vector.getValueType();
 938   EVT EltVT = VecVT.getVectorElementType();
 939   SmallVector<SDValue, 8> Args;
 940
 941   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 942                                                            i != e; ++i) {
 943     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
 944                                DAG.getConstant(i, DL, getVectorIdxTy())));
 945   }
 946
 947   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 948 }
 949
 950 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 951                                                     SelectionDAG &DAG) const {
 952
 953   SDLoc DL(Op);
 954   SDValue Vector = Op.getOperand(0);
 955   SDValue Index = Op.getOperand(1);
 956
 957   if (isa<ConstantSDNode>(Index) ||
 958       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 959     return Op;
 960
 961   Vector = vectorToVerticalVector(DAG, Vector);
 962   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 963                      Vector, Index);
 964 }
 965
 966 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 967                                                    SelectionDAG &DAG) const {
 968   SDLoc DL(Op);
 969   SDValue Vector = Op.getOperand(0);
 970   SDValue Value = Op.getOperand(1);
 971   SDValue Index = Op.getOperand(2);
 972
 973   if (isa<ConstantSDNode>(Index) ||
 974       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 975     return Op;
 976
 977   Vector = vectorToVerticalVector(DAG, Vector);
 978   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 979                                Vector, Value, Index);
 980   return vectorToVerticalVector(DAG, Insert);
 981 }
 982
 983 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 984   // On hw >= R700, COS/SIN input must be between -1. and 1.
 985   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 986   EVT VT = Op.getValueType();
 987   SDValue Arg = Op.getOperand(0);
 988   SDLoc DL(Op);
 989   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
 990       DAG.getNode(ISD::FADD, DL, VT,
 991         DAG.getNode(ISD::FMUL, DL, VT, Arg,
 992           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
 993         DAG.getConstantFP(0.5, DL, MVT::f32)));
 994   unsigned TrigNode;
 995   switch (Op.getOpcode()) {
 996   case ISD::FCOS:
 997     TrigNode = AMDGPUISD::COS_HW;
 998     break;
 999   case ISD::FSIN:
1000     TrigNode = AMDGPUISD::SIN_HW;
1001     break;
1002   default:
1003     llvm_unreachable("Wrong trig opcode");
1004   }
1005   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
1006       DAG.getNode(ISD::FADD, DL, VT, FractPart,
1007         DAG.getConstantFP(-0.5, DL, MVT::f32)));
1008   if (Gen >= AMDGPUSubtarget::R700)
1009     return TrigVal;
1010   // On R600 hw, COS/SIN input must be between -Pi and Pi.
1011   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
1012       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
1013 }
1014
1015 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1016   SDLoc DL(Op);
1017   EVT VT = Op.getValueType();
1018
1019   SDValue Lo = Op.getOperand(0);
1020   SDValue Hi = Op.getOperand(1);
1021   SDValue Shift = Op.getOperand(2);
1022   SDValue Zero = DAG.getConstant(0, DL, VT);
1023   SDValue One  = DAG.getConstant(1, DL, VT);
1024
1025   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1026   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1027   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1028   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1029
1030   // The dance around Width1 is necessary for 0 special case.
1031   // Without it the CompShift might be 32, producing incorrect results in
1032   // Overflow. So we do the shift in two steps, the alternative is to
1033   // add a conditional to filter the special case.
1034
1035   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1036   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1037
1038   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1039   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1040   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1041
1042   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1043   SDValue LoBig = Zero;
1044
1045   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1046   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1047
1048   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1049 }
1050
1051 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1052   SDLoc DL(Op);
1053   EVT VT = Op.getValueType();
1054
1055   SDValue Lo = Op.getOperand(0);
1056   SDValue Hi = Op.getOperand(1);
1057   SDValue Shift = Op.getOperand(2);
1058   SDValue Zero = DAG.getConstant(0, DL, VT);
1059   SDValue One  = DAG.getConstant(1, DL, VT);
1060
1061   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1062
1063   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1064   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1065   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1066   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1067
1068   // The dance around Width1 is necessary for 0 special case.
1069   // Without it the CompShift might be 32, producing incorrect results in
1070   // Overflow. So we do the shift in two steps, the alternative is to
1071   // add a conditional to filter the special case.
1072
1073   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1074   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1075
1076   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1077   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1078   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1079
1080   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1081   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1082
1083   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1084   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1085
1086   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1087 }
1088
1089 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1090                                           unsigned mainop, unsigned ovf) const {
1091   SDLoc DL(Op);
1092   EVT VT = Op.getValueType();
1093
1094   SDValue Lo = Op.getOperand(0);
1095   SDValue Hi = Op.getOperand(1);
1096
1097   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1098   // Extend sign.
1099   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1100                     DAG.getValueType(MVT::i1));
1101
1102   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1103
1104   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1105 }
1106
1107 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1108   SDLoc DL(Op);
1109   return DAG.getNode(
1110       ISD::SETCC,
1111       DL,
1112       MVT::i1,
1113       Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
1114       DAG.getCondCode(ISD::SETNE)
1115       );
1116 }
1117
1118 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1119                                                    SDLoc DL,
1120                                                    unsigned DwordOffset) const {
1121   unsigned ByteOffset = DwordOffset * 4;
1122   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1123                                       AMDGPUAS::CONSTANT_BUFFER_0);
1124
1125   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1126   assert(isInt<16>(ByteOffset));
1127
1128   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1129                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1130                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1131                      false, false, false, 0);
1132 }
1133
1134 bool R600TargetLowering::isZero(SDValue Op) const {
1135   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1136     return Cst->isNullValue();
1137   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1138     return CstFP->isZero();
1139   } else {
1140     return false;
1141   }
1142 }
1143
1144 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1145   SDLoc DL(Op);
1146   EVT VT = Op.getValueType();
1147
1148   SDValue LHS = Op.getOperand(0);
1149   SDValue RHS = Op.getOperand(1);
1150   SDValue True = Op.getOperand(2);
1151   SDValue False = Op.getOperand(3);
1152   SDValue CC = Op.getOperand(4);
1153   SDValue Temp;
1154
1155   if (VT == MVT::f32) {
1156     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1157     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1158     if (MinMax)
1159       return MinMax;
1160   }
1161
1162   // LHS and RHS are guaranteed to be the same value type
1163   EVT CompareVT = LHS.getValueType();
1164
1165   // Check if we can lower this to a native operation.
1166
1167   // Try to lower to a SET* instruction:
1168   //
1169   // SET* can match the following patterns:
1170   //
1171   // select_cc f32, f32, -1,  0, cc_supported
1172   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1173   // select_cc i32, i32, -1,  0, cc_supported
1174   //
1175
1176   // Move hardware True/False values to the correct operand.
1177   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1178   ISD::CondCode InverseCC =
1179      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1180   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1181     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1182       std::swap(False, True);
1183       CC = DAG.getCondCode(InverseCC);
1184     } else {
1185       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1186       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1187         std::swap(False, True);
1188         std::swap(LHS, RHS);
1189         CC = DAG.getCondCode(SwapInvCC);
1190       }
1191     }
1192   }
1193
1194   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1195       (CompareVT == VT || VT == MVT::i32)) {
1196     // This can be matched by a SET* instruction.
1197     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1198   }
1199
1200   // Try to lower to a CND* instruction:
1201   //
1202   // CND* can match the following patterns:
1203   //
1204   // select_cc f32, 0.0, f32, f32, cc_supported
1205   // select_cc f32, 0.0, i32, i32, cc_supported
1206   // select_cc i32, 0,   f32, f32, cc_supported
1207   // select_cc i32, 0,   i32, i32, cc_supported
1208   //
1209
1210   // Try to move the zero value to the RHS
1211   if (isZero(LHS)) {
1212     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1213     // Try swapping the operands
1214     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1215     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1216       std::swap(LHS, RHS);
1217       CC = DAG.getCondCode(CCSwapped);
1218     } else {
1219       // Try inverting the conditon and then swapping the operands
1220       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1221       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1222       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1223         std::swap(True, False);
1224         std::swap(LHS, RHS);
1225         CC = DAG.getCondCode(CCSwapped);
1226       }
1227     }
1228   }
1229   if (isZero(RHS)) {
1230     SDValue Cond = LHS;
1231     SDValue Zero = RHS;
1232     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1233     if (CompareVT != VT) {
1234       // Bitcast True / False to the correct types.  This will end up being
1235       // a nop, but it allows us to define only a single pattern in the
1236       // .TD files for each CND* instruction rather than having to have
1237       // one pattern for integer True/False and one for fp True/False
1238       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1239       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1240     }
1241
1242     switch (CCOpcode) {
1243     case ISD::SETONE:
1244     case ISD::SETUNE:
1245     case ISD::SETNE:
1246       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1247       Temp = True;
1248       True = False;
1249       False = Temp;
1250       break;
1251     default:
1252       break;
1253     }
1254     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1255         Cond, Zero,
1256         True, False,
1257         DAG.getCondCode(CCOpcode));
1258     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1259   }
1260
1261   // If we make it this for it means we have no native instructions to handle
1262   // this SELECT_CC, so we must lower it.
1263   SDValue HWTrue, HWFalse;
1264
1265   if (CompareVT == MVT::f32) {
1266     HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
1267     HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
1268   } else if (CompareVT == MVT::i32) {
1269     HWTrue = DAG.getConstant(-1, DL, CompareVT);
1270     HWFalse = DAG.getConstant(0, DL, CompareVT);
1271   }
1272   else {
1273     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1274   }
1275
1276   // Lower this unsupported SELECT_CC into a combination of two supported
1277   // SELECT_CC operations.
1278   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1279
1280   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1281       Cond, HWFalse,
1282       True, False,
1283       DAG.getCondCode(ISD::SETNE));
1284 }
1285
1286 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1287 /// convert these pointers to a register index.  Each register holds
1288 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1289 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1290 /// for indirect addressing.
1291 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1292                                                unsigned StackWidth,
1293                                                SelectionDAG &DAG) const {
1294   unsigned SRLPad;
1295   switch(StackWidth) {
1296   case 1:
1297     SRLPad = 2;
1298     break;
1299   case 2:
1300     SRLPad = 3;
1301     break;
1302   case 4:
1303     SRLPad = 4;
1304     break;
1305   default: llvm_unreachable("Invalid stack width");
1306   }
1307
1308   SDLoc DL(Ptr);
1309   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1310                      DAG.getConstant(SRLPad, DL, MVT::i32));
1311 }
1312
1313 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1314                                          unsigned ElemIdx,
1315                                          unsigned &Channel,
1316                                          unsigned &PtrIncr) const {
1317   switch (StackWidth) {
1318   default:
1319   case 1:
1320     Channel = 0;
1321     if (ElemIdx > 0) {
1322       PtrIncr = 1;
1323     } else {
1324       PtrIncr = 0;
1325     }
1326     break;
1327   case 2:
1328     Channel = ElemIdx % 2;
1329     if (ElemIdx == 2) {
1330       PtrIncr = 1;
1331     } else {
1332       PtrIncr = 0;
1333     }
1334     break;
1335   case 4:
1336     Channel = ElemIdx;
1337     PtrIncr = 0;
1338     break;
1339   }
1340 }
1341
/// Custom lowering for ISD::STORE.
///
/// The operands of \p Op are (Chain, Value, Ptr).  The generic
/// AMDGPUTargetLowering::LowerSTORE gets the first chance.  After that:
///   - truncating stores to the global address space become an
///     AMDGPUISD::STORE_MSKOR node;
///   - other global stores of values at least 32 bits wide get their pointer
///     rewritten as an AMDGPUISD::DWORDADDR of the pointer shifted right by 2;
///   - stores to the private address space become AMDGPUISD::REGISTER_STORE
///     nodes (one per element for vector values);
///   - everything else yields an empty SDValue.
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  // Let the common AMDGPU lowering handle the store if it can.
  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      // Mask covering the bits that are actually stored (8 or 16 bits).
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
      }
      // Split the byte address into a dword address and the byte position
      // inside that dword.
      // NOTE(review): DWordAddr is built with the stored value's type (VT)
      // and the 0x3 constant below also uses VT, while ByteIndex uses the
      // pointer type -- presumably all of these are i32 here; confirm.
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, DL, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, DL, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      // Bit position inside the dword: ByteIndex * 8.
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                   DAG.getConstant(3, DL, VT));
      // Move both the value and its mask to that bit position.
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      // The shifted value goes in element 0 and the mask in element 3 of a
      // v4i32; the two middle elements are zero.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));

      // Truncating stores were already handled by the branch above, so only
      // the indexed check can trigger here.
      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  // Only private address space stores are handled from here on.
  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // NOTE(review): this repeats the call made at the top of the function with
  // the same arguments -- confirm whether it can ever return a node here.
  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode()) {
    return Ret;
  }
  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  // Turn the byte address into a register index.
  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    // NOTE(review): the assertion text says "in load" although this is the
    // store path.
    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    // Emit one register store per element.  Ptr is updated in place, so the
    // increments returned by getStackAddress accumulate across iterations.
    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, DL, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, DL, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, DL, MVT::i32));
    }
    // Join the per-element stores into a single chain.
     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
   } else {
    // i8 values are zero-extended to i32 before being stored.
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
    DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
  }

  return Chain;
}
1454
1455 // return (512 + (kc_bank << 12)
1456 static int
1457 ConstantAddressBlock(unsigned AddressSpace) {
1458   switch (AddressSpace) {
1459   case AMDGPUAS::CONSTANT_BUFFER_0:
1460     return 512;
1461   case AMDGPUAS::CONSTANT_BUFFER_1:
1462     return 512 + 4096;
1463   case AMDGPUAS::CONSTANT_BUFFER_2:
1464     return 512 + 4096 * 2;
1465   case AMDGPUAS::CONSTANT_BUFFER_3:
1466     return 512 + 4096 * 3;
1467   case AMDGPUAS::CONSTANT_BUFFER_4:
1468     return 512 + 4096 * 4;
1469   case AMDGPUAS::CONSTANT_BUFFER_5:
1470     return 512 + 4096 * 5;
1471   case AMDGPUAS::CONSTANT_BUFFER_6:
1472     return 512 + 4096 * 6;
1473   case AMDGPUAS::CONSTANT_BUFFER_7:
1474     return 512 + 4096 * 7;
1475   case AMDGPUAS::CONSTANT_BUFFER_8:
1476     return 512 + 4096 * 8;
1477   case AMDGPUAS::CONSTANT_BUFFER_9:
1478     return 512 + 4096 * 9;
1479   case AMDGPUAS::CONSTANT_BUFFER_10:
1480     return 512 + 4096 * 10;
1481   case AMDGPUAS::CONSTANT_BUFFER_11:
1482     return 512 + 4096 * 11;
1483   case AMDGPUAS::CONSTANT_BUFFER_12:
1484     return 512 + 4096 * 12;
1485   case AMDGPUAS::CONSTANT_BUFFER_13:
1486     return 512 + 4096 * 13;
1487   case AMDGPUAS::CONSTANT_BUFFER_14:
1488     return 512 + 4096 * 14;
1489   case AMDGPUAS::CONSTANT_BUFFER_15:
1490     return 512 + 4096 * 15;
1491   default:
1492     return -1;
1493   }
1494 }
1495
1496 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1497 {
1498   EVT VT = Op.getValueType();
1499   SDLoc DL(Op);
1500   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1501   SDValue Chain = Op.getOperand(0);
1502   SDValue Ptr = Op.getOperand(1);
1503   SDValue LoweredLoad;
1504
1505   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1506   if (Ret.getNode()) {
1507     SDValue Ops[2] = {
1508       Ret,
1509       Chain
1510     };
1511     return DAG.getMergeValues(Ops, DL);
1512   }
1513
1514   // Lower loads constant address space global variable loads
1515   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1516       isa<GlobalVariable>(GetUnderlyingObject(
1517           LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
1518
1519     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1520         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1521     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1522         DAG.getConstant(2, DL, MVT::i32));
1523     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1524                        LoadNode->getChain(), Ptr,
1525                        DAG.getTargetConstant(0, DL, MVT::i32),
1526                        Op.getOperand(2));
1527   }
1528
1529   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1530     SDValue MergedValues[2] = {
1531       ScalarizeVectorLoad(Op, DAG),
1532       Chain
1533     };
1534     return DAG.getMergeValues(MergedValues, DL);
1535   }
1536
1537   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1538   if (ConstantBlock > -1 &&
1539       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1540        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1541     SDValue Result;
1542     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1543         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1544         isa<ConstantSDNode>(Ptr)) {
1545       SDValue Slots[4];
1546       for (unsigned i = 0; i < 4; i++) {
1547         // We want Const position encoded with the following formula :
1548         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1549         // const_index is Ptr computed by llvm using an alignment of 16.
1550         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1551         // then div by 4 at the ISel step
1552         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1553             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1554         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1555       }
1556       EVT NewVT = MVT::v4i32;
1557       unsigned NumElements = 4;
1558       if (VT.isVector()) {
1559         NewVT = VT;
1560         NumElements = VT.getVectorNumElements();
1561       }
1562       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1563                            makeArrayRef(Slots, NumElements));
1564     } else {
1565       // non-constant ptr can't be folded, keeps it as a v4f32 load
1566       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1567           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1568                       DAG.getConstant(4, DL, MVT::i32)),
1569                       DAG.getConstant(LoadNode->getAddressSpace() -
1570                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1571           );
1572     }
1573
1574     if (!VT.isVector()) {
1575       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1576                            DAG.getConstant(0, DL, MVT::i32));
1577     }
1578
1579     SDValue MergedValues[2] = {
1580       Result,
1581       Chain
1582     };
1583     return DAG.getMergeValues(MergedValues, DL);
1584   }
1585
1586   // For most operations returning SDValue() will result in the node being
1587   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1588   // need to manually expand loads that may be legal in some address spaces and
1589   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1590   // compute shaders, since the data is sign extended when it is uploaded to the
1591   // buffer. However SEXT loads from other address spaces are not supported, so
1592   // we need to expand them here.
1593   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1594     EVT MemVT = LoadNode->getMemoryVT();
1595     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1596     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1597                                   LoadNode->getPointerInfo(), MemVT,
1598                                   LoadNode->isVolatile(),
1599                                   LoadNode->isNonTemporal(),
1600                                   LoadNode->isInvariant(),
1601                                   LoadNode->getAlignment());
1602     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1603                               DAG.getValueType(MemVT));
1604
1605     SDValue MergedValues[2] = { Res, Chain };
1606     return DAG.getMergeValues(MergedValues, DL);
1607   }
1608
1609   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1610     return SDValue();
1611   }
1612
1613   // Lowering for indirect addressing
1614   const MachineFunction &MF = DAG.getMachineFunction();
1615   const AMDGPUFrameLowering *TFL =
1616       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1617   unsigned StackWidth = TFL->getStackWidth(MF);
1618
1619   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1620
1621   if (VT.isVector()) {
1622     unsigned NumElemVT = VT.getVectorNumElements();
1623     EVT ElemVT = VT.getVectorElementType();
1624     SDValue Loads[4];
1625
1626     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1627                                       "vector width in load");
1628
1629     for (unsigned i = 0; i < NumElemVT; ++i) {
1630       unsigned Channel, PtrIncr;
1631       getStackAddress(StackWidth, i, Channel, PtrIncr);
1632       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1633                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1634       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1635                              Chain, Ptr,
1636                              DAG.getTargetConstant(Channel, DL, MVT::i32),
1637                              Op.getOperand(2));
1638     }
1639     for (unsigned i = NumElemVT; i < 4; ++i) {
1640       Loads[i] = DAG.getUNDEF(ElemVT);
1641     }
1642     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1643     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1644   } else {
1645     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1646                               Chain, Ptr,
1647                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1648                               Op.getOperand(2));
1649   }
1650
1651   SDValue Ops[2] = {
1652     LoweredLoad,
1653     Chain
1654   };
1655
1656   return DAG.getMergeValues(Ops, DL);
1657 }
1658
1659 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1660   SDValue Chain = Op.getOperand(0);
1661   SDValue Cond  = Op.getOperand(1);
1662   SDValue Jump  = Op.getOperand(2);
1663
1664   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1665                      Chain, Jump, Cond);
1666 }
1667
1668 /// XXX Only kernel functions are supported, so we can assume for now that
1669 /// every function is a kernel function, but in the future we should use
1670 /// separate calling conventions for kernel and non-kernel functions.
1671 SDValue R600TargetLowering::LowerFormalArguments(
1672                                       SDValue Chain,
1673                                       CallingConv::ID CallConv,
1674                                       bool isVarArg,
1675                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1676                                       SDLoc DL, SelectionDAG &DAG,
1677                                       SmallVectorImpl<SDValue> &InVals) const {
1678   SmallVector<CCValAssign, 16> ArgLocs;
1679   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1680                  *DAG.getContext());
1681   MachineFunction &MF = DAG.getMachineFunction();
1682   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1683
1684   SmallVector<ISD::InputArg, 8> LocalIns;
1685
1686   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1687
1688   AnalyzeFormalArguments(CCInfo, LocalIns);
1689
1690   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1691     CCValAssign &VA = ArgLocs[i];
1692     const ISD::InputArg &In = Ins[i];
1693     EVT VT = In.VT;
1694     EVT MemVT = VA.getLocVT();
1695     if (!VT.isVector() && MemVT.isVector()) {
1696       // Get load source type if scalarized.
1697       MemVT = MemVT.getVectorElementType();
1698     }
1699
1700     if (MFI->getShaderType() != ShaderType::COMPUTE) {
1701       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1702       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1703       InVals.push_back(Register);
1704       continue;
1705     }
1706
1707     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1708                                           AMDGPUAS::CONSTANT_BUFFER_0);
1709
1710     // i64 isn't a legal type, so the register type used ends up as i32, which
1711     // isn't expected here. It attempts to create this sextload, but it ends up
1712     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1713     // for <1 x i64>.
1714
1715     // The first 36 bytes of the input buffer contains information about
1716     // thread group and global sizes.
1717     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1718     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1719       // FIXME: This should really check the extload type, but the handling of
1720       // extload vector parameters seems to be broken.
1721
1722       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1723       Ext = ISD::SEXTLOAD;
1724     }
1725
1726     // Compute the offset from the value.
1727     // XXX - I think PartOffset should give you this, but it seems to give the
1728     // size of the register which isn't useful.
1729
1730     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1731     unsigned PartOffset = VA.getLocMemOffset();
1732     unsigned Offset = 36 + VA.getLocMemOffset();
1733
1734     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1735     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1736                               DAG.getConstant(Offset, DL, MVT::i32),
1737                               DAG.getUNDEF(MVT::i32),
1738                               PtrInfo,
1739                               MemVT, false, true, true, 4);
1740
1741     // 4 is the preferred alignment for the CONSTANT memory space.
1742     InVals.push_back(Arg);
1743     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1744   }
1745   return Chain;
1746 }
1747
1748 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1749    if (!VT.isVector())
1750      return MVT::i32;
1751    return VT.changeVectorElementTypeToInteger();
1752 }
1753
1754 static SDValue CompactSwizzlableVector(
1755   SelectionDAG &DAG, SDValue VectorEntry,
1756   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1757   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1758   assert(RemapSwizzle.empty());
1759   SDValue NewBldVec[4] = {
1760     VectorEntry.getOperand(0),
1761     VectorEntry.getOperand(1),
1762     VectorEntry.getOperand(2),
1763     VectorEntry.getOperand(3)
1764   };
1765
1766   for (unsigned i = 0; i < 4; i++) {
1767     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1768       // We mask write here to teach later passes that the ith element of this
1769       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1770       // break false dependencies and additionnaly make assembly easier to read.
1771       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1772     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1773       if (C->isZero()) {
1774         RemapSwizzle[i] = 4; // SEL_0
1775         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1776       } else if (C->isExactlyValue(1.0)) {
1777         RemapSwizzle[i] = 5; // SEL_1
1778         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1779       }
1780     }
1781
1782     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1783       continue;
1784     for (unsigned j = 0; j < i; j++) {
1785       if (NewBldVec[i] == NewBldVec[j]) {
1786         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1787         RemapSwizzle[i] = j;
1788         break;
1789       }
1790     }
1791   }
1792
1793   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1794                      VectorEntry.getValueType(), NewBldVec);
1795 }
1796
1797 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1798                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1799   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1800   assert(RemapSwizzle.empty());
1801   SDValue NewBldVec[4] = {
1802       VectorEntry.getOperand(0),
1803       VectorEntry.getOperand(1),
1804       VectorEntry.getOperand(2),
1805       VectorEntry.getOperand(3)
1806   };
1807   bool isUnmovable[4] = { false, false, false, false };
1808   for (unsigned i = 0; i < 4; i++) {
1809     RemapSwizzle[i] = i;
1810     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1811       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1812           ->getZExtValue();
1813       if (i == Idx)
1814         isUnmovable[Idx] = true;
1815     }
1816   }
1817
1818   for (unsigned i = 0; i < 4; i++) {
1819     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1820       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1821           ->getZExtValue();
1822       if (isUnmovable[Idx])
1823         continue;
1824       // Swap i and Idx
1825       std::swap(NewBldVec[Idx], NewBldVec[i]);
1826       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1827       break;
1828     }
1829   }
1830
1831   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1832                      VectorEntry.getValueType(), NewBldVec);
1833 }
1834
1835
1836 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1837                                             SDValue Swz[4], SelectionDAG &DAG,
1838                                             SDLoc DL) const {
1839   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1840   // Old -> New swizzle values
1841   DenseMap<unsigned, unsigned> SwizzleRemap;
1842
1843   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1844   for (unsigned i = 0; i < 4; i++) {
1845     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1846     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1847       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1848   }
1849
1850   SwizzleRemap.clear();
1851   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1852   for (unsigned i = 0; i < 4; i++) {
1853     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1854     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1855       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1856   }
1857
1858   return BuildVector;
1859 }
1860
1861
1862 //===----------------------------------------------------------------------===//
1863 // Custom DAG Optimizations
1864 //===----------------------------------------------------------------------===//
1865
1866 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1867                                               DAGCombinerInfo &DCI) const {
1868   SelectionDAG &DAG = DCI.DAG;
1869
1870   switch (N->getOpcode()) {
1871   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1872   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1873   case ISD::FP_ROUND: {
1874       SDValue Arg = N->getOperand(0);
1875       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1876         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1877                            Arg.getOperand(0));
1878       }
1879       break;
1880     }
1881
1882   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1883   // (i32 select_cc f32, f32, -1, 0 cc)
1884   //
1885   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1886   // this to one of the SET*_DX10 instructions.
1887   case ISD::FP_TO_SINT: {
1888     SDValue FNeg = N->getOperand(0);
1889     if (FNeg.getOpcode() != ISD::FNEG) {
1890       return SDValue();
1891     }
1892     SDValue SelectCC = FNeg.getOperand(0);
1893     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1894         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1895         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1896         !isHWTrueValue(SelectCC.getOperand(2)) ||
1897         !isHWFalseValue(SelectCC.getOperand(3))) {
1898       return SDValue();
1899     }
1900
1901     SDLoc dl(N);
1902     return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1903                            SelectCC.getOperand(0), // LHS
1904                            SelectCC.getOperand(1), // RHS
1905                            DAG.getConstant(-1, dl, MVT::i32), // True
1906                            DAG.getConstant(0, dl, MVT::i32),  // False
1907                            SelectCC.getOperand(4)); // CC
1908
1909     break;
1910   }
1911
1912   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1913   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1914   case ISD::INSERT_VECTOR_ELT: {
1915     SDValue InVec = N->getOperand(0);
1916     SDValue InVal = N->getOperand(1);
1917     SDValue EltNo = N->getOperand(2);
1918     SDLoc dl(N);
1919
1920     // If the inserted element is an UNDEF, just use the input vector.
1921     if (InVal.getOpcode() == ISD::UNDEF)
1922       return InVec;
1923
1924     EVT VT = InVec.getValueType();
1925
1926     // If we can't generate a legal BUILD_VECTOR, exit
1927     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1928       return SDValue();
1929
1930     // Check that we know which element is being inserted
1931     if (!isa<ConstantSDNode>(EltNo))
1932       return SDValue();
1933     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1934
1935     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1936     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1937     // vector elements.
1938     SmallVector<SDValue, 8> Ops;
1939     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1940       Ops.append(InVec.getNode()->op_begin(),
1941                  InVec.getNode()->op_end());
1942     } else if (InVec.getOpcode() == ISD::UNDEF) {
1943       unsigned NElts = VT.getVectorNumElements();
1944       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1945     } else {
1946       return SDValue();
1947     }
1948
1949     // Insert the element
1950     if (Elt < Ops.size()) {
1951       // All the operands of BUILD_VECTOR must have the same type;
1952       // we enforce that here.
1953       EVT OpVT = Ops[0].getValueType();
1954       if (InVal.getValueType() != OpVT)
1955         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1956           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1957           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1958       Ops[Elt] = InVal;
1959     }
1960
1961     // Return the new vector
1962     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1963   }
1964
1965   // Extract_vec (Build_vector) generated by custom lowering
1966   // also needs to be customly combined
1967   case ISD::EXTRACT_VECTOR_ELT: {
1968     SDValue Arg = N->getOperand(0);
1969     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1970       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1971         unsigned Element = Const->getZExtValue();
1972         return Arg->getOperand(Element);
1973       }
1974     }
1975     if (Arg.getOpcode() == ISD::BITCAST &&
1976         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1977       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1978         unsigned Element = Const->getZExtValue();
1979         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1980             Arg->getOperand(0).getOperand(Element));
1981       }
1982     }
1983   }
1984
1985   case ISD::SELECT_CC: {
1986     // Try common optimizations
1987     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1988     if (Ret.getNode())
1989       return Ret;
1990
1991     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1992     //      selectcc x, y, a, b, inv(cc)
1993     //
1994     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1995     //      selectcc x, y, a, b, cc
1996     SDValue LHS = N->getOperand(0);
1997     if (LHS.getOpcode() != ISD::SELECT_CC) {
1998       return SDValue();
1999     }
2000
2001     SDValue RHS = N->getOperand(1);
2002     SDValue True = N->getOperand(2);
2003     SDValue False = N->getOperand(3);
2004     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2005
2006     if (LHS.getOperand(2).getNode() != True.getNode() ||
2007         LHS.getOperand(3).getNode() != False.getNode() ||
2008         RHS.getNode() != False.getNode()) {
2009       return SDValue();
2010     }
2011
2012     switch (NCC) {
2013     default: return SDValue();
2014     case ISD::SETNE: return LHS;
2015     case ISD::SETEQ: {
2016       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2017       LHSCC = ISD::getSetCCInverse(LHSCC,
2018                                   LHS.getOperand(0).getValueType().isInteger());
2019       if (DCI.isBeforeLegalizeOps() ||
2020           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2021         return DAG.getSelectCC(SDLoc(N),
2022                                LHS.getOperand(0),
2023                                LHS.getOperand(1),
2024                                LHS.getOperand(2),
2025                                LHS.getOperand(3),
2026                                LHSCC);
2027       break;
2028     }
2029     }
2030     return SDValue();
2031   }
2032
2033   case AMDGPUISD::EXPORT: {
2034     SDValue Arg = N->getOperand(1);
2035     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2036       break;
2037
2038     SDValue NewArgs[8] = {
2039       N->getOperand(0), // Chain
2040       SDValue(),
2041       N->getOperand(2), // ArrayBase
2042       N->getOperand(3), // Type
2043       N->getOperand(4), // SWZ_X
2044       N->getOperand(5), // SWZ_Y
2045       N->getOperand(6), // SWZ_Z
2046       N->getOperand(7) // SWZ_W
2047     };
2048     SDLoc DL(N);
2049     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2050     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2051   }
2052   case AMDGPUISD::TEXTURE_FETCH: {
2053     SDValue Arg = N->getOperand(1);
2054     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2055       break;
2056
2057     SDValue NewArgs[19] = {
2058       N->getOperand(0),
2059       N->getOperand(1),
2060       N->getOperand(2),
2061       N->getOperand(3),
2062       N->getOperand(4),
2063       N->getOperand(5),
2064       N->getOperand(6),
2065       N->getOperand(7),
2066       N->getOperand(8),
2067       N->getOperand(9),
2068       N->getOperand(10),
2069       N->getOperand(11),
2070       N->getOperand(12),
2071       N->getOperand(13),
2072       N->getOperand(14),
2073       N->getOperand(15),
2074       N->getOperand(16),
2075       N->getOperand(17),
2076       N->getOperand(18),
2077     };
2078     SDLoc DL(N);
2079     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2080     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2081   }
2082   }
2083
2084   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2085 }
2086
2087 static bool
2088 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2089             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2090   const R600InstrInfo *TII =
2091       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2092   if (!Src.isMachineOpcode())
2093     return false;
2094   switch (Src.getMachineOpcode()) {
2095   case AMDGPU::FNEG_R600:
2096     if (!Neg.getNode())
2097       return false;
2098     Src = Src.getOperand(0);
2099     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2100     return true;
2101   case AMDGPU::FABS_R600:
2102     if (!Abs.getNode())
2103       return false;
2104     Src = Src.getOperand(0);
2105     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2106     return true;
2107   case AMDGPU::CONST_COPY: {
2108     unsigned Opcode = ParentNode->getMachineOpcode();
2109     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2110
2111     if (!Sel.getNode())
2112       return false;
2113
2114     SDValue CstOffset = Src.getOperand(0);
2115     if (ParentNode->getValueType(0).isVector())
2116       return false;
2117
2118     // Gather constants values
2119     int SrcIndices[] = {
2120       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2121       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2122       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2123       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2124       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2125       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2126       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2127       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2128       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2129       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2130       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2131     };
2132     std::vector<unsigned> Consts;
2133     for (int OtherSrcIdx : SrcIndices) {
2134       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2135       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2136         continue;
2137       if (HasDst) {
2138         OtherSrcIdx--;
2139         OtherSelIdx--;
2140       }
2141       if (RegisterSDNode *Reg =
2142           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2143         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2144           ConstantSDNode *Cst
2145             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2146           Consts.push_back(Cst->getZExtValue());
2147         }
2148       }
2149     }
2150
2151     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2152     Consts.push_back(Cst->getZExtValue());
2153     if (!TII->fitsConstReadLimitations(Consts)) {
2154       return false;
2155     }
2156
2157     Sel = CstOffset;
2158     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2159     return true;
2160   }
2161   case AMDGPU::MOV_IMM_I32:
2162   case AMDGPU::MOV_IMM_F32: {
2163     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2164     uint64_t ImmValue = 0;
2165
2166
2167     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2168       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2169       float FloatValue = FPC->getValueAPF().convertToFloat();
2170       if (FloatValue == 0.0) {
2171         ImmReg = AMDGPU::ZERO;
2172       } else if (FloatValue == 0.5) {
2173         ImmReg = AMDGPU::HALF;
2174       } else if (FloatValue == 1.0) {
2175         ImmReg = AMDGPU::ONE;
2176       } else {
2177         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2178       }
2179     } else {
2180       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2181       uint64_t Value = C->getZExtValue();
2182       if (Value == 0) {
2183         ImmReg = AMDGPU::ZERO;
2184       } else if (Value == 1) {
2185         ImmReg = AMDGPU::ONE_INT;
2186       } else {
2187         ImmValue = Value;
2188       }
2189     }
2190
2191     // Check that we aren't already using an immediate.
2192     // XXX: It's possible for an instruction to have more than one
2193     // immediate operand, but this is not supported yet.
2194     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2195       if (!Imm.getNode())
2196         return false;
2197       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2198       assert(C);
2199       if (C->getZExtValue())
2200         return false;
2201       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2202     }
2203     Src = DAG.getRegister(ImmReg, MVT::i32);
2204     return true;
2205   }
2206   default:
2207     return false;
2208   }
2209 }
2210
2211
2212 /// \brief Fold the instructions after selecting them
2213 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2214                                             SelectionDAG &DAG) const {
2215   const R600InstrInfo *TII =
2216       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2217   if (!Node->isMachineOpcode())
2218     return Node;
2219   unsigned Opcode = Node->getMachineOpcode();
2220   SDValue FakeOp;
2221
2222   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2223
2224   if (Opcode == AMDGPU::DOT_4) {
2225     int OperandIdx[] = {
2226       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2227       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2228       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2229       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2230       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2231       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2232       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2233       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2234         };
2235     int NegIdx[] = {
2236       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2237       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2238       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2239       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2240       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2241       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2242       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2243       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2244     };
2245     int AbsIdx[] = {
2246       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2247       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2248       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2249       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2250       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2251       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2252       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2253       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2254     };
2255     for (unsigned i = 0; i < 8; i++) {
2256       if (OperandIdx[i] < 0)
2257         return Node;
2258       SDValue &Src = Ops[OperandIdx[i] - 1];
2259       SDValue &Neg = Ops[NegIdx[i] - 1];
2260       SDValue &Abs = Ops[AbsIdx[i] - 1];
2261       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2262       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2263       if (HasDst)
2264         SelIdx--;
2265       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2266       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2267         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2268     }
2269   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2270     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2271       SDValue &Src = Ops[i];
2272       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2273         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2274     }
2275   } else if (Opcode == AMDGPU::CLAMP_R600) {
2276     SDValue Src = Node->getOperand(0);
2277     if (!Src.isMachineOpcode() ||
2278         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2279       return Node;
2280     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2281         AMDGPU::OpName::clamp);
2282     if (ClampIdx < 0)
2283       return Node;
2284     SDLoc DL(Node);
2285     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2286     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2287     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2288                               Node->getVTList(), Ops);
2289   } else {
2290     if (!TII->hasInstrModifiers(Opcode))
2291       return Node;
2292     int OperandIdx[] = {
2293       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2294       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2295       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2296     };
2297     int NegIdx[] = {
2298       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2299       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2300       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2301     };
2302     int AbsIdx[] = {
2303       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2304       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2305       -1
2306     };
2307     for (unsigned i = 0; i < 3; i++) {
2308       if (OperandIdx[i] < 0)
2309         return Node;
2310       SDValue &Src = Ops[OperandIdx[i] - 1];
2311       SDValue &Neg = Ops[NegIdx[i] - 1];
2312       SDValue FakeAbs;
2313       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2314       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2315       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2316       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2317       if (HasDst) {
2318         SelIdx--;
2319         ImmIdx--;
2320       }
2321       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2322       SDValue &Imm = Ops[ImmIdx];
2323       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2324         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2325     }
2326   }
2327
2328   return Node;
2329 }