lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
  34                                        const AMDGPUSubtarget &STI)
  35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42
  43   computeRegisterProperties(STI.getRegisterInfo());
  44
  45   // Set condition code actions
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // Expand sign extension of vectors
  95   if (!Subtarget->hasBFE())
  96     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  97
  98   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 100
 101   if (!Subtarget->hasBFE())
 102     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 103   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 105
 106   if (!Subtarget->hasBFE())
 107     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 110
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 113   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 114
 115   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 116
 117
 118   // Legalize loads and stores to the private address space.
 119   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 120   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 121   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 122
 123   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 124   // spaces, so it is custom lowered to handle those where it isn't.
 125   for (MVT VT : MVT::integer_valuetypes()) {
 126     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 127     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
 128     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
 129
 130     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 131     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
 132     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
 133
 134     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 135     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
 136     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
 137   }
 138
 139   setOperationAction(ISD::STORE, MVT::i8, Custom);
 140   setOperationAction(ISD::STORE, MVT::i32, Custom);
 141   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 142   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 143   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 144   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 145
 146   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 147   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 148   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 149
 150   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 151   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 152   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 153   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 154
 155   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 156   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 157   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 158   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 159
 160   setTargetDAGCombine(ISD::FP_ROUND);
 161   setTargetDAGCombine(ISD::FP_TO_SINT);
 162   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 163   setTargetDAGCombine(ISD::SELECT_CC);
 164   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 165
 166   setOperationAction(ISD::SUB, MVT::i64, Expand);
 167
 168   // These should be replaced by UDVIREM, but it does not happen automatically
 169   // during Type Legalization
 170   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 171   setOperationAction(ISD::UREM, MVT::i64, Custom);
 172   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 173   setOperationAction(ISD::SREM, MVT::i64, Custom);
 174
 175   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 176   //  to be Legal/Custom in order to avoid library calls.
 177   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 178   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 179   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 180
 181   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 182
 183   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 184   for (MVT VT : ScalarIntVTs) {
 185     setOperationAction(ISD::ADDC, VT, Expand);
 186     setOperationAction(ISD::SUBC, VT, Expand);
 187     setOperationAction(ISD::ADDE, VT, Expand);
 188     setOperationAction(ISD::SUBE, VT, Expand);
 189   }
 190
 191   setSchedulingPreference(Sched::Source);
 192 }
 193
 194 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 195     MachineInstr * MI, MachineBasicBlock * BB) const {
 196   MachineFunction * MF = BB->getParent();
 197   MachineRegisterInfo &MRI = MF->getRegInfo();
 198   MachineBasicBlock::iterator I = *MI;
 199   const R600InstrInfo *TII =
 200       static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 201
 202   switch (MI->getOpcode()) {
 203   default:
 204     // Replace LDS_*_RET instruction that don't have any uses with the
 205     // equivalent LDS_*_NORET instruction.
 206     if (TII->isLDSRetInstr(MI->getOpcode())) {
 207       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 208       assert(DstIdx != -1);
 209       MachineInstrBuilder NewMI;
 210       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 211       //        LDS_1A2D support and remove this special case.
 212       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
 213            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
 214         return BB;
 215
 216       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 217                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 218       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 219         NewMI.addOperand(MI->getOperand(i));
 220       }
 221     } else {
 222       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 223     }
 224     break;
 225   case AMDGPU::CLAMP_R600: {
 226     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 227                                                    AMDGPU::MOV,
 228                                                    MI->getOperand(0).getReg(),
 229                                                    MI->getOperand(1).getReg());
 230     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 231     break;
 232   }
 233
 234   case AMDGPU::FABS_R600: {
 235     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 236                                                     AMDGPU::MOV,
 237                                                     MI->getOperand(0).getReg(),
 238                                                     MI->getOperand(1).getReg());
 239     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 240     break;
 241   }
 242
 243   case AMDGPU::FNEG_R600: {
 244     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 245                                                     AMDGPU::MOV,
 246                                                     MI->getOperand(0).getReg(),
 247                                                     MI->getOperand(1).getReg());
 248     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 249     break;
 250   }
 251
 252   case AMDGPU::MASK_WRITE: {
 253     unsigned maskedRegister = MI->getOperand(0).getReg();
 254     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 255     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 256     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 257     break;
 258   }
 259
 260   case AMDGPU::MOV_IMM_F32:
 261     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 262                      MI->getOperand(1).getFPImm()->getValueAPF()
 263                          .bitcastToAPInt().getZExtValue());
 264     break;
 265   case AMDGPU::MOV_IMM_I32:
 266     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 267                      MI->getOperand(1).getImm());
 268     break;
 269   case AMDGPU::CONST_COPY: {
 270     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 271         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 272     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 273         MI->getOperand(1).getImm());
 274     break;
 275   }
 276
 277   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 278   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 279   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 280     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 281
 282     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 283             .addOperand(MI->getOperand(0))
 284             .addOperand(MI->getOperand(1))
 285             .addImm(EOP); // Set End of program bit
 286     break;
 287   }
 288
 289   case AMDGPU::TXD: {
 290     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 291     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 292     MachineOperand &RID = MI->getOperand(4);
 293     MachineOperand &SID = MI->getOperand(5);
 294     unsigned TextureId = MI->getOperand(6).getImm();
 295     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 296     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 297
 298     switch (TextureId) {
 299     case 5: // Rect
 300       CTX = CTY = 0;
 301       break;
 302     case 6: // Shadow1D
 303       SrcW = SrcZ;
 304       break;
 305     case 7: // Shadow2D
 306       SrcW = SrcZ;
 307       break;
 308     case 8: // ShadowRect
 309       CTX = CTY = 0;
 310       SrcW = SrcZ;
 311       break;
 312     case 9: // 1DArray
 313       SrcZ = SrcY;
 314       CTZ = 0;
 315       break;
 316     case 10: // 2DArray
 317       CTZ = 0;
 318       break;
 319     case 11: // Shadow1DArray
 320       SrcZ = SrcY;
 321       CTZ = 0;
 322       break;
 323     case 12: // Shadow2DArray
 324       CTZ = 0;
 325       break;
 326     }
 327     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 328             .addOperand(MI->getOperand(3))
 329             .addImm(SrcX)
 330             .addImm(SrcY)
 331             .addImm(SrcZ)
 332             .addImm(SrcW)
 333             .addImm(0)
 334             .addImm(0)
 335             .addImm(0)
 336             .addImm(0)
 337             .addImm(1)
 338             .addImm(2)
 339             .addImm(3)
 340             .addOperand(RID)
 341             .addOperand(SID)
 342             .addImm(CTX)
 343             .addImm(CTY)
 344             .addImm(CTZ)
 345             .addImm(CTW);
 346     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 347             .addOperand(MI->getOperand(2))
 348             .addImm(SrcX)
 349             .addImm(SrcY)
 350             .addImm(SrcZ)
 351             .addImm(SrcW)
 352             .addImm(0)
 353             .addImm(0)
 354             .addImm(0)
 355             .addImm(0)
 356             .addImm(1)
 357             .addImm(2)
 358             .addImm(3)
 359             .addOperand(RID)
 360             .addOperand(SID)
 361             .addImm(CTX)
 362             .addImm(CTY)
 363             .addImm(CTZ)
 364             .addImm(CTW);
 365     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 366             .addOperand(MI->getOperand(0))
 367             .addOperand(MI->getOperand(1))
 368             .addImm(SrcX)
 369             .addImm(SrcY)
 370             .addImm(SrcZ)
 371             .addImm(SrcW)
 372             .addImm(0)
 373             .addImm(0)
 374             .addImm(0)
 375             .addImm(0)
 376             .addImm(1)
 377             .addImm(2)
 378             .addImm(3)
 379             .addOperand(RID)
 380             .addOperand(SID)
 381             .addImm(CTX)
 382             .addImm(CTY)
 383             .addImm(CTZ)
 384             .addImm(CTW)
 385             .addReg(T0, RegState::Implicit)
 386             .addReg(T1, RegState::Implicit);
 387     break;
 388   }
 389
 390   case AMDGPU::TXD_SHADOW: {
 391     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 392     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 393     MachineOperand &RID = MI->getOperand(4);
 394     MachineOperand &SID = MI->getOperand(5);
 395     unsigned TextureId = MI->getOperand(6).getImm();
 396     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 397     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 398
 399     switch (TextureId) {
 400     case 5: // Rect
 401       CTX = CTY = 0;
 402       break;
 403     case 6: // Shadow1D
 404       SrcW = SrcZ;
 405       break;
 406     case 7: // Shadow2D
 407       SrcW = SrcZ;
 408       break;
 409     case 8: // ShadowRect
 410       CTX = CTY = 0;
 411       SrcW = SrcZ;
 412       break;
 413     case 9: // 1DArray
 414       SrcZ = SrcY;
 415       CTZ = 0;
 416       break;
 417     case 10: // 2DArray
 418       CTZ = 0;
 419       break;
 420     case 11: // Shadow1DArray
 421       SrcZ = SrcY;
 422       CTZ = 0;
 423       break;
 424     case 12: // Shadow2DArray
 425       CTZ = 0;
 426       break;
 427     }
 428
 429     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 430             .addOperand(MI->getOperand(3))
 431             .addImm(SrcX)
 432             .addImm(SrcY)
 433             .addImm(SrcZ)
 434             .addImm(SrcW)
 435             .addImm(0)
 436             .addImm(0)
 437             .addImm(0)
 438             .addImm(0)
 439             .addImm(1)
 440             .addImm(2)
 441             .addImm(3)
 442             .addOperand(RID)
 443             .addOperand(SID)
 444             .addImm(CTX)
 445             .addImm(CTY)
 446             .addImm(CTZ)
 447             .addImm(CTW);
 448     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 449             .addOperand(MI->getOperand(2))
 450             .addImm(SrcX)
 451             .addImm(SrcY)
 452             .addImm(SrcZ)
 453             .addImm(SrcW)
 454             .addImm(0)
 455             .addImm(0)
 456             .addImm(0)
 457             .addImm(0)
 458             .addImm(1)
 459             .addImm(2)
 460             .addImm(3)
 461             .addOperand(RID)
 462             .addOperand(SID)
 463             .addImm(CTX)
 464             .addImm(CTY)
 465             .addImm(CTZ)
 466             .addImm(CTW);
 467     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 468             .addOperand(MI->getOperand(0))
 469             .addOperand(MI->getOperand(1))
 470             .addImm(SrcX)
 471             .addImm(SrcY)
 472             .addImm(SrcZ)
 473             .addImm(SrcW)
 474             .addImm(0)
 475             .addImm(0)
 476             .addImm(0)
 477             .addImm(0)
 478             .addImm(1)
 479             .addImm(2)
 480             .addImm(3)
 481             .addOperand(RID)
 482             .addOperand(SID)
 483             .addImm(CTX)
 484             .addImm(CTY)
 485             .addImm(CTZ)
 486             .addImm(CTW)
 487             .addReg(T0, RegState::Implicit)
 488             .addReg(T1, RegState::Implicit);
 489     break;
 490   }
 491
 492   case AMDGPU::BRANCH:
 493       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 494               .addOperand(MI->getOperand(0));
 495       break;
 496
 497   case AMDGPU::BRANCH_COND_f32: {
 498     MachineInstr *NewMI =
 499       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 500               AMDGPU::PREDICATE_BIT)
 501               .addOperand(MI->getOperand(1))
 502               .addImm(OPCODE_IS_NOT_ZERO)
 503               .addImm(0); // Flags
 504     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 505     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 506             .addOperand(MI->getOperand(0))
 507             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 508     break;
 509   }
 510
 511   case AMDGPU::BRANCH_COND_i32: {
 512     MachineInstr *NewMI =
 513       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 514             AMDGPU::PREDICATE_BIT)
 515             .addOperand(MI->getOperand(1))
 516             .addImm(OPCODE_IS_NOT_ZERO_INT)
 517             .addImm(0); // Flags
 518     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 519     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 520            .addOperand(MI->getOperand(0))
 521             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 522     break;
 523   }
 524
 525   case AMDGPU::EG_ExportSwz:
 526   case AMDGPU::R600_ExportSwz: {
 527     // Instruction is left unmodified if its not the last one of its type
 528     bool isLastInstructionOfItsType = true;
 529     unsigned InstExportType = MI->getOperand(1).getImm();
 530     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 531          EndBlock = BB->end(); NextExportInst != EndBlock;
 532          NextExportInst = std::next(NextExportInst)) {
 533       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 534           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 535         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 536             .getImm();
 537         if (CurrentInstExportType == InstExportType) {
 538           isLastInstructionOfItsType = false;
 539           break;
 540         }
 541       }
 542     }
 543     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 544     if (!EOP && !isLastInstructionOfItsType)
 545       return BB;
 546     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 547     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 548             .addOperand(MI->getOperand(0))
 549             .addOperand(MI->getOperand(1))
 550             .addOperand(MI->getOperand(2))
 551             .addOperand(MI->getOperand(3))
 552             .addOperand(MI->getOperand(4))
 553             .addOperand(MI->getOperand(5))
 554             .addOperand(MI->getOperand(6))
 555             .addImm(CfInst)
 556             .addImm(EOP);
 557     break;
 558   }
 559   case AMDGPU::RETURN: {
 560     // RETURN instructions must have the live-out registers as implicit uses,
 561     // otherwise they appear dead.
 562     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 563     MachineInstrBuilder MIB(*MF, MI);
 564     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 565       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 566     return BB;
 567   }
 568   }
 569
 570   MI->eraseFromParent();
 571   return BB;
 572 }
 573
 574 //===----------------------------------------------------------------------===//
 575 // Custom DAG Lowering Operations
 576 //===----------------------------------------------------------------------===//
 577
 578 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 579   MachineFunction &MF = DAG.getMachineFunction();
 580   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 581   switch (Op.getOpcode()) {
 582   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 583   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 584   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 585   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 586   case ISD::SRA_PARTS:
 587   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 588   case ISD::FCOS:
 589   case ISD::FSIN: return LowerTrig(Op, DAG);
 590   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 591   case ISD::STORE: return LowerSTORE(Op, DAG);
 592   case ISD::LOAD: {
 593     SDValue Result = LowerLOAD(Op, DAG);
 594     assert((!Result.getNode() ||
 595             Result.getNode()->getNumValues() == 2) &&
 596            "Load should return a value and a chain");
 597     return Result;
 598   }
 599
 600   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 601   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 602   case ISD::INTRINSIC_VOID: {
 603     SDValue Chain = Op.getOperand(0);
 604     unsigned IntrinsicID =
 605                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 606     switch (IntrinsicID) {
 607     case AMDGPUIntrinsic::AMDGPU_store_output: {
 608       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 609       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 610       MFI->LiveOuts.push_back(Reg);
 611       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 612     }
 613     case AMDGPUIntrinsic::R600_store_swizzle: {
 614       SDLoc DL(Op);
 615       const SDValue Args[8] = {
 616         Chain,
 617         Op.getOperand(2), // Export Value
 618         Op.getOperand(3), // ArrayBase
 619         Op.getOperand(4), // Type
 620         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
 621         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
 622         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
 623         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
 624       };
 625       return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
 626     }
 627
 628     // default for switch(IntrinsicID)
 629     default: break;
 630     }
 631     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 632     break;
 633   }
 634   case ISD::INTRINSIC_WO_CHAIN: {
 635     unsigned IntrinsicID =
 636                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 637     EVT VT = Op.getValueType();
 638     SDLoc DL(Op);
 639     switch(IntrinsicID) {
 640     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 641     case AMDGPUIntrinsic::R600_load_input: {
 642       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 643       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 644       MachineFunction &MF = DAG.getMachineFunction();
 645       MachineRegisterInfo &MRI = MF.getRegInfo();
 646       MRI.addLiveIn(Reg);
 647       return DAG.getCopyFromReg(DAG.getEntryNode(),
 648           SDLoc(DAG.getEntryNode()), Reg, VT);
 649     }
 650
 651     case AMDGPUIntrinsic::R600_interp_input: {
 652       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 653       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 654       MachineSDNode *interp;
 655       if (ijb < 0) {
 656         const R600InstrInfo *TII =
 657             static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 658         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 659             MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32));
 660         return DAG.getTargetExtractSubreg(
 661             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 662             DL, MVT::f32, SDValue(interp, 0));
 663       }
 664       MachineFunction &MF = DAG.getMachineFunction();
 665       MachineRegisterInfo &MRI = MF.getRegInfo();
 666       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 667       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 668       MRI.addLiveIn(RegisterI);
 669       MRI.addLiveIn(RegisterJ);
 670       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 671           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 672       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 673           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 674
 675       if (slot % 4 < 2)
 676         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 677             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
 678             RegisterJNode, RegisterINode);
 679       else
 680         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 681             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
 682             RegisterJNode, RegisterINode);
 683       return SDValue(interp, slot % 2);
 684     }
 685     case AMDGPUIntrinsic::R600_interp_xy:
 686     case AMDGPUIntrinsic::R600_interp_zw: {
 687       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 688       MachineSDNode *interp;
 689       SDValue RegisterINode = Op.getOperand(2);
 690       SDValue RegisterJNode = Op.getOperand(3);
 691
 692       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 693         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 694             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
 695             RegisterJNode, RegisterINode);
 696       else
 697         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 698             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
 699             RegisterJNode, RegisterINode);
 700       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 701           SDValue(interp, 0), SDValue(interp, 1));
 702     }
 703     case AMDGPUIntrinsic::R600_tex:
 704     case AMDGPUIntrinsic::R600_texc:
 705     case AMDGPUIntrinsic::R600_txl:
 706     case AMDGPUIntrinsic::R600_txlc:
 707     case AMDGPUIntrinsic::R600_txb:
 708     case AMDGPUIntrinsic::R600_txbc:
 709     case AMDGPUIntrinsic::R600_txf:
 710     case AMDGPUIntrinsic::R600_txq:
 711     case AMDGPUIntrinsic::R600_ddx:
 712     case AMDGPUIntrinsic::R600_ddy:
 713     case AMDGPUIntrinsic::R600_ldptr: {
 714       unsigned TextureOp;
 715       switch (IntrinsicID) {
 716       case AMDGPUIntrinsic::R600_tex:
 717         TextureOp = 0;
 718         break;
 719       case AMDGPUIntrinsic::R600_texc:
 720         TextureOp = 1;
 721         break;
 722       case AMDGPUIntrinsic::R600_txl:
 723         TextureOp = 2;
 724         break;
 725       case AMDGPUIntrinsic::R600_txlc:
 726         TextureOp = 3;
 727         break;
 728       case AMDGPUIntrinsic::R600_txb:
 729         TextureOp = 4;
 730         break;
 731       case AMDGPUIntrinsic::R600_txbc:
 732         TextureOp = 5;
 733         break;
 734       case AMDGPUIntrinsic::R600_txf:
 735         TextureOp = 6;
 736         break;
 737       case AMDGPUIntrinsic::R600_txq:
 738         TextureOp = 7;
 739         break;
 740       case AMDGPUIntrinsic::R600_ddx:
 741         TextureOp = 8;
 742         break;
 743       case AMDGPUIntrinsic::R600_ddy:
 744         TextureOp = 9;
 745         break;
 746       case AMDGPUIntrinsic::R600_ldptr:
 747         TextureOp = 10;
 748         break;
 749       default:
 750         llvm_unreachable("Unknow Texture Operation");
 751       }
 752
 753       SDValue TexArgs[19] = {
 754         DAG.getConstant(TextureOp, DL, MVT::i32),
 755         Op.getOperand(1),
 756         DAG.getConstant(0, DL, MVT::i32),
 757         DAG.getConstant(1, DL, MVT::i32),
 758         DAG.getConstant(2, DL, MVT::i32),
 759         DAG.getConstant(3, DL, MVT::i32),
 760         Op.getOperand(2),
 761         Op.getOperand(3),
 762         Op.getOperand(4),
 763         DAG.getConstant(0, DL, MVT::i32),
 764         DAG.getConstant(1, DL, MVT::i32),
 765         DAG.getConstant(2, DL, MVT::i32),
 766         DAG.getConstant(3, DL, MVT::i32),
 767         Op.getOperand(5),
 768         Op.getOperand(6),
 769         Op.getOperand(7),
 770         Op.getOperand(8),
 771         Op.getOperand(9),
 772         Op.getOperand(10)
 773       };
 774       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 775     }
 776     case AMDGPUIntrinsic::AMDGPU_dp4: {
 777       SDValue Args[8] = {
 778       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 779           DAG.getConstant(0, DL, MVT::i32)),
 780       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 781           DAG.getConstant(0, DL, MVT::i32)),
 782       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 783           DAG.getConstant(1, DL, MVT::i32)),
 784       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 785           DAG.getConstant(1, DL, MVT::i32)),
 786       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 787           DAG.getConstant(2, DL, MVT::i32)),
 788       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 789           DAG.getConstant(2, DL, MVT::i32)),
 790       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 791           DAG.getConstant(3, DL, MVT::i32)),
 792       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 793           DAG.getConstant(3, DL, MVT::i32))
 794       };
 795       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 796     }
 797
 798     case Intrinsic::r600_read_ngroups_x:
 799       return LowerImplicitParameter(DAG, VT, DL, 0);
 800     case Intrinsic::r600_read_ngroups_y:
 801       return LowerImplicitParameter(DAG, VT, DL, 1);
 802     case Intrinsic::r600_read_ngroups_z:
 803       return LowerImplicitParameter(DAG, VT, DL, 2);
 804     case Intrinsic::r600_read_global_size_x:
 805       return LowerImplicitParameter(DAG, VT, DL, 3);
 806     case Intrinsic::r600_read_global_size_y:
 807       return LowerImplicitParameter(DAG, VT, DL, 4);
 808     case Intrinsic::r600_read_global_size_z:
 809       return LowerImplicitParameter(DAG, VT, DL, 5);
 810     case Intrinsic::r600_read_local_size_x:
 811       return LowerImplicitParameter(DAG, VT, DL, 6);
 812     case Intrinsic::r600_read_local_size_y:
 813       return LowerImplicitParameter(DAG, VT, DL, 7);
 814     case Intrinsic::r600_read_local_size_z:
 815       return LowerImplicitParameter(DAG, VT, DL, 8);
 816
 817     case Intrinsic::AMDGPU_read_workdim:
 818       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
 819
 820     case Intrinsic::r600_read_tgid_x:
 821       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 822                                   AMDGPU::T1_X, VT);
 823     case Intrinsic::r600_read_tgid_y:
 824       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 825                                   AMDGPU::T1_Y, VT);
 826     case Intrinsic::r600_read_tgid_z:
 827       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 828                                   AMDGPU::T1_Z, VT);
 829     case Intrinsic::r600_read_tidig_x:
 830       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 831                                   AMDGPU::T0_X, VT);
 832     case Intrinsic::r600_read_tidig_y:
 833       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 834                                   AMDGPU::T0_Y, VT);
 835     case Intrinsic::r600_read_tidig_z:
 836       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 837                                   AMDGPU::T0_Z, VT);
 838     case Intrinsic::AMDGPU_rsq:
 839       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 840       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 841
 842     case AMDGPUIntrinsic::AMDGPU_fract:
 843     case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
 844       return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
 845     }
 846     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 847     break;
 848   }
 849   } // end switch(Op.getOpcode())
 850   return SDValue();
 851 }
 852
 853 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 854                                             SmallVectorImpl<SDValue> &Results,
 855                                             SelectionDAG &DAG) const {
       // Append to Results the replacement value(s) for node N.  Opcodes that
       // have no case below are forwarded to the AMDGPUTargetLowering
       // implementation.
 856   switch (N->getOpcode()) {
 857   default:
 858     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 859     return;
 860   case ISD::FP_TO_UINT:
         // An i1 result gets its own lowering (LowerFPTOUINT on the operand).
 861     if (N->getValueType(0) == MVT::i1) {
 862       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 863       return;
 864     }
 865     // Fall-through. Since we don't care about out of bounds values
 866     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 867     // considers some extra cases which are not necessary here.
 868   case ISD::FP_TO_SINT: {
 869     SDValue Result;
         // A result is only recorded when expandFP_TO_SINT returns true.
 870     if (expandFP_TO_SINT(N, Result, DAG))
 871       Results.push_back(Result);
 872     return;
 873   }
       // UDIV/UREM/SDIV/SREM are each rewritten as the matching two-result
       // UDIVREM/SDIVREM node on the same two operands.  The division cases push
       // result 0 of that node and the remainder cases push result 1.
 874   case ISD::UDIV: {
 875     SDValue Op = SDValue(N, 0);
 876     SDLoc DL(Op);
 877     EVT VT = Op.getValueType();
 878     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 879       N->getOperand(0), N->getOperand(1));
 880     Results.push_back(UDIVREM);
 881     break;
 882   }
 883   case ISD::UREM: {
 884     SDValue Op = SDValue(N, 0);
 885     SDLoc DL(Op);
 886     EVT VT = Op.getValueType();
 887     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 888       N->getOperand(0), N->getOperand(1));
 889     Results.push_back(UDIVREM.getValue(1));
 890     break;
 891   }
 892   case ISD::SDIV: {
 893     SDValue Op = SDValue(N, 0);
 894     SDLoc DL(Op);
 895     EVT VT = Op.getValueType();
 896     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 897       N->getOperand(0), N->getOperand(1));
 898     Results.push_back(SDIVREM);
 899     break;
 900   }
 901   case ISD::SREM: {
 902     SDValue Op = SDValue(N, 0);
 903     SDLoc DL(Op);
 904     EVT VT = Op.getValueType();
 905     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 906       N->getOperand(0), N->getOperand(1));
 907     Results.push_back(SDIVREM.getValue(1));
 908     break;
 909   }
 910   case ISD::SDIVREM: {
         // Op refers to result 1 of N.  Both results of the node returned by
         // LowerSDIVREM are pushed, result 0 first.
 911     SDValue Op = SDValue(N, 1);
 912     SDValue RES = LowerSDIVREM(Op, DAG);
 913     Results.push_back(RES);
 914     Results.push_back(RES.getValue(1));
 915     break;
 916   }
 917   case ISD::UDIVREM: {
         // Results is handed to LowerUDIVREM64 and nothing is pushed here, so the
         // helper presumably appends the replacement values itself.
 918     SDValue Op = SDValue(N, 0);
 919     LowerUDIVREM64(Op, DAG, Results);
 920     break;
 921   }
 922   }
 923 }
 924
 925 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 926                                                    SDValue Vector) const {
 927
 928   SDLoc DL(Vector);
 929   EVT VecVT = Vector.getValueType();
 930   EVT EltVT = VecVT.getVectorElementType();
 931   SmallVector<SDValue, 8> Args;
 932
 933   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 934                                                            i != e; ++i) {
 935     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
 936                                DAG.getConstant(i, DL, getVectorIdxTy())));
 937   }
 938
 939   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 940 }
 941
 942 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 943                                                     SelectionDAG &DAG) const {
 944
 945   SDLoc DL(Op);
 946   SDValue Vector = Op.getOperand(0);
 947   SDValue Index = Op.getOperand(1);
 948
 949   if (isa<ConstantSDNode>(Index) ||
 950       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 951     return Op;
 952
 953   Vector = vectorToVerticalVector(DAG, Vector);
 954   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 955                      Vector, Index);
 956 }
 957
 958 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 959                                                    SelectionDAG &DAG) const {
 960   SDLoc DL(Op);
 961   SDValue Vector = Op.getOperand(0);
 962   SDValue Value = Op.getOperand(1);
 963   SDValue Index = Op.getOperand(2);
 964
 965   if (isa<ConstantSDNode>(Index) ||
 966       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 967     return Op;
 968
 969   Vector = vectorToVerticalVector(DAG, Vector);
 970   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 971                                Vector, Value, Index);
 972   return vectorToVerticalVector(DAG, Insert);
 973 }
 974
 975 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 976   // On hw >= R700, COS/SIN input must be between -1. and 1.
 977   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
       // Op is expected to be an ISD::FCOS or ISD::FSIN node; any other opcode
       // reaches the llvm_unreachable in the switch below.
 978   EVT VT = Op.getValueType();
 979   SDValue Arg = Op.getOperand(0);
 980   SDLoc DL(Op);
       // FractPart = FRACT(Arg * 0.15915494309 + 0.5).  The multiplier is the
       // decimal value of 1 / (2 * Pi), matching the "x / 2Pi" in the formula.
 981   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
 982       DAG.getNode(ISD::FADD, DL, VT,
 983         DAG.getNode(ISD::FMUL, DL, VT, Arg,
 984           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
 985         DAG.getConstantFP(0.5, DL, MVT::f32)));
       // Pick the hardware trig node that corresponds to the opcode being lowered.
 986   unsigned TrigNode;
 987   switch (Op.getOpcode()) {
 988   case ISD::FCOS:
 989     TrigNode = AMDGPUISD::COS_HW;
 990     break;
 991   case ISD::FSIN:
 992     TrigNode = AMDGPUISD::SIN_HW;
 993     break;
 994   default:
 995     llvm_unreachable("Wrong trig opcode");
 996   }
       // TrigVal = TRIG(FractPart - 0.5); the subtraction is an FADD of -0.5.
 997   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
 998       DAG.getNode(ISD::FADD, DL, VT, FractPart,
 999         DAG.getConstantFP(-0.5, DL, MVT::f32)));
1000   if (Gen >= AMDGPUSubtarget::R700)
1001     return TrigVal;
1002   // On R600 hw, COS/SIN input must be between -Pi and Pi.
       // NOTE(review): the multiplication by Pi below is applied to TrigVal, i.e.
       // to the result of the COS_HW/SIN_HW node, not to its input as the comment
       // above suggests -- confirm this is intended.
1003   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
1004       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
1005 }
1006
1007 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
       // Lower a left shift of a value split into two VT-sized halves.  Operands
       // of Op: 0 = low half, 1 = high half, 2 = shift amount.  Returns a
       // MERGE_VALUES node carrying the new low half and the new high half, in
       // that order.
1008   SDLoc DL(Op);
1009   EVT VT = Op.getValueType();
1010
1011   SDValue Lo = Op.getOperand(0);
1012   SDValue Hi = Op.getOperand(1);
1013   SDValue Shift = Op.getOperand(2);
1014   SDValue Zero = DAG.getConstant(0, DL, VT);
1015   SDValue One  = DAG.getConstant(1, DL, VT);
1016
       // Width is the bit size of one half.  BigShift = Shift - Width is the
       // amount used when Shift >= Width; CompShift = (Width - 1) - Shift is used
       // to compute the bits of Lo that spill into Hi.
1017   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1018   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1019   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1020   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1021
1022   // The dance around Width1 is necessary for 0 special case.
1023   // Without it the CompShift might be 32, producing incorrect results in
1024   // Overflow. So we do the shift in two steps, the alternative is to
1025   // add a conditional to filter the special case.
1026
       // Overflow = (Lo >> CompShift) >> 1: the bits of Lo that move into Hi.
1027   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1028   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1029
       // Values for Shift < Width: Hi = (Hi << Shift) | Overflow, Lo = Lo << Shift.
1030   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1031   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1032   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1033
       // Values for Shift >= Width: Hi = Lo << (Shift - Width), Lo = 0.
1034   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1035   SDValue LoBig = Zero;
1036
       // Select between the two variants with an unsigned Shift < Width compare.
1037   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1038   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1039
1040   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1041 }
1042
1043 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
       // Lower a right shift of a value split into two VT-sized halves.  Operands
       // of Op: 0 = low half, 1 = high half, 2 = shift amount.  The shift is
       // arithmetic when Op is an ISD::SRA_PARTS node and logical otherwise.
       // Returns a MERGE_VALUES node carrying the new low half and the new high
       // half, in that order.
1044   SDLoc DL(Op);
1045   EVT VT = Op.getValueType();
1046
1047   SDValue Lo = Op.getOperand(0);
1048   SDValue Hi = Op.getOperand(1);
1049   SDValue Shift = Op.getOperand(2);
1050   SDValue Zero = DAG.getConstant(0, DL, VT);
1051   SDValue One  = DAG.getConstant(1, DL, VT);
1052
1053   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1054
       // Width is the bit size of one half.  BigShift = Shift - Width is the
       // amount used when Shift >= Width; CompShift = (Width - 1) - Shift is used
       // to compute the bits of Hi that spill into Lo.
1055   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1056   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1057   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1058   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1059
1060   // The dance around Width1 is necessary for 0 special case.
1061   // Without it the CompShift might be 32, producing incorrect results in
1062   // Overflow. So we do the shift in two steps, the alternative is to
1063   // add a conditional to filter the special case.
1064
       // Overflow = (Hi << CompShift) << 1: the bits of Hi that move into Lo.
1065   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1066   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1067
       // Values for Shift < Width: Hi is shifted right by Shift (SRA or SRL) and
       // Lo = (Lo >> Shift) | Overflow.
1068   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1069   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1070   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1071
       // Values for Shift >= Width: Lo is Hi shifted right by (Shift - Width).
       // Hi becomes Hi >> (Width - 1) with an arithmetic shift for SRA, and 0
       // otherwise.
1072   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1073   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1074
       // Select between the two variants with an unsigned Shift < Width compare.
1075   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1076   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1077
1078   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1079 }
1080
1081 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1082   // Lower an FP-to-unsigned conversion with an i1 result (see the FP_TO_UINT
1083   // case in ReplaceNodeResults): the result is simply "Op != 0.0", emitted
1084   // as an i1 SETCC with condition code SETNE against the f32 constant 0.0.
1085   SDLoc Loc(Op);
1086   SDValue FPZero = DAG.getConstantFP(0.0f, Loc, MVT::f32);
1087   SDValue NotEqual = DAG.getCondCode(ISD::SETNE);
1088
1089   return DAG.getNode(ISD::SETCC, Loc, MVT::i1, Op, FPZero, NotEqual);
1090 }
1091
1092 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1093                                                    SDLoc DL,
1094                                                    unsigned DwordOffset) const {
1095   unsigned ByteOffset = DwordOffset * 4;
1096   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1097                                       AMDGPUAS::CONSTANT_BUFFER_0);
1098
1099   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1100   assert(isInt<16>(ByteOffset));
1101
1102   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1103                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1104                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1105                      false, false, false, 0);
1106 }
1107
1108 bool R600TargetLowering::isZero(SDValue Op) const {
1109   // True only for an integer or FP constant node whose value is zero.
1110   if (ConstantSDNode *IntCst = dyn_cast<ConstantSDNode>(Op))
1111     return IntCst->isNullValue();
1112   if (ConstantFPSDNode *FPCst = dyn_cast<ConstantFPSDNode>(Op))
1113     return FPCst->isZero();
1114   // Anything that is not a constant node is never treated as zero.
1115   return false;
1116 }
1117
1118 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
       // Custom lowering of a SELECT_CC node.  Operands of Op: 0 and 1 are the
       // compared values, 2 and 3 the values selected when the condition is true
       // and false, 4 the condition code.  The strategies below are tried in
       // order:
       //   1. for an f32 result, the legacy fmin/fmax combine;
       //   2. a form a SET* instruction can match;
       //   3. a form a CND* instruction can match (compare against zero);
       //   4. expansion into two SELECT_CC nodes.
1119   SDLoc DL(Op);
1120   EVT VT = Op.getValueType();
1121
1122   SDValue LHS = Op.getOperand(0);
1123   SDValue RHS = Op.getOperand(1);
1124   SDValue True = Op.getOperand(2);
1125   SDValue False = Op.getOperand(3);
1126   SDValue CC = Op.getOperand(4);
1127   SDValue Temp;
1128
1129   if (VT == MVT::f32) {
1130     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1131     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1132     if (MinMax)
1133       return MinMax;
1134   }
1135
1136   // LHS and RHS are guaranteed to be the same value type
1137   EVT CompareVT = LHS.getValueType();
1138
1139   // Check if we can lower this to a native operation.
1140
1141   // Try to lower to a SET* instruction:
1142   //
1143   // SET* can match the following patterns:
1144   //
1145   // select_cc f32, f32, -1,  0, cc_supported
1146   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1147   // select_cc i32, i32, -1,  0, cc_supported
1148   //
1149
1150   // Move hardware True/False values to the correct operand.
       // When the two selected values are the hardware constants in the wrong
       // order, exchange them and use the inverse condition instead.  If the
       // inverse is not legal, try the inverse with LHS and RHS swapped as well;
       // if that is not legal either, nothing is changed.
1151   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1152   ISD::CondCode InverseCC =
1153      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1154   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1155     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1156       std::swap(False, True);
1157       CC = DAG.getCondCode(InverseCC);
1158     } else {
1159       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1160       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1161         std::swap(False, True);
1162         std::swap(LHS, RHS);
1163         CC = DAG.getCondCode(SwapInvCC);
1164       }
1165     }
1166   }
1167
1168   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1169       (CompareVT == VT || VT == MVT::i32)) {
1170     // This can be matched by a SET* instruction.
1171     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1172   }
1173
1174   // Try to lower to a CND* instruction:
1175   //
1176   // CND* can match the following patterns:
1177   //
1178   // select_cc f32, 0.0, f32, f32, cc_supported
1179   // select_cc f32, 0.0, i32, i32, cc_supported
1180   // select_cc i32, 0,   f32, f32, cc_supported
1181   // select_cc i32, 0,   i32, i32, cc_supported
1182   //
1183
1184   // Try to move the zero value to the RHS
1185   if (isZero(LHS)) {
1186     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1187     // Try swapping the operands
1188     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1189     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1190       std::swap(LHS, RHS);
1191       CC = DAG.getCondCode(CCSwapped);
1192     } else {
1193       // Try inverting the condition and then swapping the operands
1194       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1195       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1196       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1197         std::swap(True, False);
1198         std::swap(LHS, RHS);
1199         CC = DAG.getCondCode(CCSwapped);
1200       }
1201     }
1202   }
1203   if (isZero(RHS)) {
1204     SDValue Cond = LHS;
1205     SDValue Zero = RHS;
1206     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1207     if (CompareVT != VT) {
1208       // Bitcast True / False to the correct types.  This will end up being
1209       // a nop, but it allows us to define only a single pattern in the
1210       // .TD files for each CND* instruction rather than having to have
1211       // one pattern for integer True/False and one for fp True/False
1212       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1213       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1214     }
1215
         // A not-equal comparison is replaced by its inverse condition with the
         // True and False values exchanged; other conditions are left as they are.
1216     switch (CCOpcode) {
1217     case ISD::SETONE:
1218     case ISD::SETUNE:
1219     case ISD::SETNE:
1220       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1221       Temp = True;
1222       True = False;
1223       False = Temp;
1224       break;
1225     default:
1226       break;
1227     }
         // The select is built in the compare type and bitcast back to VT.
1228     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1229         Cond, Zero,
1230         True, False,
1231         DAG.getCondCode(CCOpcode));
1232     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1233   }
1234
1235   // If we make it this far it means we have no native instructions to handle
1236   // this SELECT_CC, so we must lower it.
1237   SDValue HWTrue, HWFalse;
1238
       // True/false constants in the compare type: 1.0f / 0.0f for f32 and
       // -1 / 0 for i32.  No other compare type is handled.
1239   if (CompareVT == MVT::f32) {
1240     HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
1241     HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
1242   } else if (CompareVT == MVT::i32) {
1243     HWTrue = DAG.getConstant(-1, DL, CompareVT);
1244     HWFalse = DAG.getConstant(0, DL, CompareVT);
1245   }
1246   else {
1247     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1248   }
1249
1250   // Lower this unsupported SELECT_CC into a combination of two supported
1251   // SELECT_CC operations.
       // The first select evaluates the original comparison to HWTrue or HWFalse;
       // the second picks True or False depending on whether that result differs
       // from HWFalse.
1252   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1253
1254   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1255       Cond, HWFalse,
1256       True, False,
1257       DAG.getCondCode(ISD::SETNE));
1258 }
1259
1260 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1261 /// convert these pointers to a register index.  Each register holds
1262 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1263 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1264 /// for indirect addressing.
1265 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1266                                                unsigned StackWidth,
1267                                                SelectionDAG &DAG) const {
1268   unsigned SRLPad;
1269   switch(StackWidth) {
1270   case 1:
1271     SRLPad = 2;
1272     break;
1273   case 2:
1274     SRLPad = 3;
1275     break;
1276   case 4:
1277     SRLPad = 4;
1278     break;
1279   default: llvm_unreachable("Invalid stack width");
1280   }
1281
1282   SDLoc DL(Ptr);
1283   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1284                      DAG.getConstant(SRLPad, DL, MVT::i32));
1285 }
1286
1287 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1288                                          unsigned ElemIdx,
1289                                          unsigned &Channel,
1290                                          unsigned &PtrIncr) const {
1291   switch (StackWidth) {
1292   default:
1293   case 1:
1294     Channel = 0;
1295     if (ElemIdx > 0) {
1296       PtrIncr = 1;
1297     } else {
1298       PtrIncr = 0;
1299     }
1300     break;
1301   case 2:
1302     Channel = ElemIdx % 2;
1303     if (ElemIdx == 2) {
1304       PtrIncr = 1;
1305     } else {
1306       PtrIncr = 0;
1307     }
1308     break;
1309   case 4:
1310     Channel = ElemIdx;
1311     PtrIncr = 0;
1312     break;
1313   }
1314 }
1315
1316 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1317   SDLoc DL(Op);
1318   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1319   SDValue Chain = Op.getOperand(0);
1320   SDValue Value = Op.getOperand(1);
1321   SDValue Ptr = Op.getOperand(2);
1322
1323   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1324   if (Result.getNode()) {
1325     return Result;
1326   }
1327
1328   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1329     if (StoreNode->isTruncatingStore()) {
1330       EVT VT = Value.getValueType();
1331       assert(VT.bitsLE(MVT::i32));
1332       EVT MemVT = StoreNode->getMemoryVT();
1333       SDValue MaskConstant;
1334       if (MemVT == MVT::i8) {
1335         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1336       } else {
1337         assert(MemVT == MVT::i16);
1338         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1339       }
1340       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1341                                       DAG.getConstant(2, DL, MVT::i32));
1342       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1343                                       DAG.getConstant(0x00000003, DL, VT));
1344       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1345       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1346                                    DAG.getConstant(3, DL, VT));
1347       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1348       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1349       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1350       // vector instead.
1351       SDValue Src[4] = {
1352         ShiftedValue,
1353         DAG.getConstant(0, DL, MVT::i32),
1354         DAG.getConstant(0, DL, MVT::i32),
1355         Mask
1356       };
1357       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1358       SDValue Args[3] = { Chain, Input, DWordAddr };
1359       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1360                                      Op->getVTList(), Args, MemVT,
1361                                      StoreNode->getMemOperand());
1362     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1363                Value.getValueType().bitsGE(MVT::i32)) {
1364       // Convert pointer from byte address to dword address.
1365       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1366                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1367                                     Ptr, DAG.getConstant(2, DL, MVT::i32)));
1368
1369       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1370         llvm_unreachable("Truncated and indexed stores not supported yet");
1371       } else {
1372         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1373       }
1374       return Chain;
1375     }
1376   }
1377
1378   EVT ValueVT = Value.getValueType();
1379
1380   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1381     return SDValue();
1382   }
1383
1384   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1385   if (Ret.getNode()) {
1386     return Ret;
1387   }
1388   // Lowering for indirect addressing
1389
1390   const MachineFunction &MF = DAG.getMachineFunction();
1391   const AMDGPUFrameLowering *TFL =
1392       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1393   unsigned StackWidth = TFL->getStackWidth(MF);
1394
1395   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1396
1397   if (ValueVT.isVector()) {
1398     unsigned NumElemVT = ValueVT.getVectorNumElements();
1399     EVT ElemVT = ValueVT.getVectorElementType();
1400     SmallVector<SDValue, 4> Stores(NumElemVT);
1401
1402     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1403                                       "vector width in load");
1404
1405     for (unsigned i = 0; i < NumElemVT; ++i) {
1406       unsigned Channel, PtrIncr;
1407       getStackAddress(StackWidth, i, Channel, PtrIncr);
1408       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1409                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1410       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1411                                  Value, DAG.getConstant(i, DL, MVT::i32));
1412
1413       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1414                               Chain, Elem, Ptr,
1415                               DAG.getTargetConstant(Channel, DL, MVT::i32));
1416     }
1417      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1418    } else {
1419     if (ValueVT == MVT::i8) {
1420       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1421     }
1422     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1423     DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
1424   }
1425
1426   return Chain;
1427 }
1428
1429 // return (512 + (kc_bank << 12)
1430 static int
1431 ConstantAddressBlock(unsigned AddressSpace) {
1432   switch (AddressSpace) {
1433   case AMDGPUAS::CONSTANT_BUFFER_0:
1434     return 512;
1435   case AMDGPUAS::CONSTANT_BUFFER_1:
1436     return 512 + 4096;
1437   case AMDGPUAS::CONSTANT_BUFFER_2:
1438     return 512 + 4096 * 2;
1439   case AMDGPUAS::CONSTANT_BUFFER_3:
1440     return 512 + 4096 * 3;
1441   case AMDGPUAS::CONSTANT_BUFFER_4:
1442     return 512 + 4096 * 4;
1443   case AMDGPUAS::CONSTANT_BUFFER_5:
1444     return 512 + 4096 * 5;
1445   case AMDGPUAS::CONSTANT_BUFFER_6:
1446     return 512 + 4096 * 6;
1447   case AMDGPUAS::CONSTANT_BUFFER_7:
1448     return 512 + 4096 * 7;
1449   case AMDGPUAS::CONSTANT_BUFFER_8:
1450     return 512 + 4096 * 8;
1451   case AMDGPUAS::CONSTANT_BUFFER_9:
1452     return 512 + 4096 * 9;
1453   case AMDGPUAS::CONSTANT_BUFFER_10:
1454     return 512 + 4096 * 10;
1455   case AMDGPUAS::CONSTANT_BUFFER_11:
1456     return 512 + 4096 * 11;
1457   case AMDGPUAS::CONSTANT_BUFFER_12:
1458     return 512 + 4096 * 12;
1459   case AMDGPUAS::CONSTANT_BUFFER_13:
1460     return 512 + 4096 * 13;
1461   case AMDGPUAS::CONSTANT_BUFFER_14:
1462     return 512 + 4096 * 14;
1463   case AMDGPUAS::CONSTANT_BUFFER_15:
1464     return 512 + 4096 * 15;
1465   default:
1466     return -1;
1467   }
1468 }
1469
1470 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1471 {
1472   EVT VT = Op.getValueType();
1473   SDLoc DL(Op);
1474   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1475   SDValue Chain = Op.getOperand(0);
1476   SDValue Ptr = Op.getOperand(1);
1477   SDValue LoweredLoad;
1478
1479   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1480   if (Ret.getNode()) {
1481     SDValue Ops[2] = {
1482       Ret,
1483       Chain
1484     };
1485     return DAG.getMergeValues(Ops, DL);
1486   }
1487
1488   // Lower loads of global variables in the constant address space
1489   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1490       isa<GlobalVariable>(GetUnderlyingObject(
1491           LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
1492
1493     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1494         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1495     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1496         DAG.getConstant(2, DL, MVT::i32));
1497     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1498                        LoadNode->getChain(), Ptr,
1499                        DAG.getTargetConstant(0, DL, MVT::i32),
1500                        Op.getOperand(2));
1501   }
1502
1503   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1504     SDValue MergedValues[2] = {
1505       ScalarizeVectorLoad(Op, DAG),
1506       Chain
1507     };
1508     return DAG.getMergeValues(MergedValues, DL);
1509   }
1510
1511   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1512   if (ConstantBlock > -1 &&
1513       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1514        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1515     SDValue Result;
1516     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1517         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1518         isa<ConstantSDNode>(Ptr)) {
1519       SDValue Slots[4];
1520       for (unsigned i = 0; i < 4; i++) {
1521         // We want Const position encoded with the following formula :
1522         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1523         // const_index is Ptr computed by llvm using an alignment of 16.
1524         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1525         // then div by 4 at the ISel step
1526         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1527             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1528         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1529       }
1530       EVT NewVT = MVT::v4i32;
1531       unsigned NumElements = 4;
1532       if (VT.isVector()) {
1533         NewVT = VT;
1534         NumElements = VT.getVectorNumElements();
1535       }
1536       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1537                            makeArrayRef(Slots, NumElements));
1538     } else {
1539       // A non-constant pointer can't be folded; keep it as a v4i32 load
1540       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1541           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1542                       DAG.getConstant(4, DL, MVT::i32)),
1543                       DAG.getConstant(LoadNode->getAddressSpace() -
1544                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1545           );
1546     }
1547
1548     if (!VT.isVector()) {
1549       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1550                            DAG.getConstant(0, DL, MVT::i32));
1551     }
1552
1553     SDValue MergedValues[2] = {
1554       Result,
1555       Chain
1556     };
1557     return DAG.getMergeValues(MergedValues, DL);
1558   }
1559
1560   // For most operations returning SDValue() will result in the node being
1561   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1562   // need to manually expand loads that may be legal in some address spaces and
1563   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1564   // compute shaders, since the data is sign extended when it is uploaded to the
1565   // buffer. However SEXT loads from other address spaces are not supported, so
1566   // we need to expand them here.
1567   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1568     EVT MemVT = LoadNode->getMemoryVT();
1569     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1570     SDValue ShiftAmount =
1571           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), DL,
1572                           MVT::i32);
1573     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1574                                   LoadNode->getPointerInfo(), MemVT,
1575                                   LoadNode->isVolatile(),
1576                                   LoadNode->isNonTemporal(),
1577                                   LoadNode->isInvariant(),
1578                                   LoadNode->getAlignment());
1579     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1580     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1581
1582     SDValue MergedValues[2] = { Sra, Chain };
1583     return DAG.getMergeValues(MergedValues, DL);
1584   }
1585
1586   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1587     return SDValue();
1588   }
1589
1590   // Lowering for indirect addressing
1591   const MachineFunction &MF = DAG.getMachineFunction();
1592   const AMDGPUFrameLowering *TFL =
1593       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1594   unsigned StackWidth = TFL->getStackWidth(MF);
1595
1596   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1597
1598   if (VT.isVector()) {
1599     unsigned NumElemVT = VT.getVectorNumElements();
1600     EVT ElemVT = VT.getVectorElementType();
1601     SDValue Loads[4];
1602
1603     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1604                                       "vector width in load");
1605
1606     for (unsigned i = 0; i < NumElemVT; ++i) {
1607       unsigned Channel, PtrIncr;
1608       getStackAddress(StackWidth, i, Channel, PtrIncr);
1609       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1610                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1611       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1612                              Chain, Ptr,
1613                              DAG.getTargetConstant(Channel, DL, MVT::i32),
1614                              Op.getOperand(2));
1615     }
1616     for (unsigned i = NumElemVT; i < 4; ++i) {
1617       Loads[i] = DAG.getUNDEF(ElemVT);
1618     }
1619     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1620     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1621   } else {
1622     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1623                               Chain, Ptr,
1624                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1625                               Op.getOperand(2));
1626   }
1627
1628   SDValue Ops[2] = {
1629     LoweredLoad,
1630     Chain
1631   };
1632
1633   return DAG.getMergeValues(Ops, DL);
1634 }
1635
1636 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1637   SDValue Chain = Op.getOperand(0);
1638   SDValue Cond  = Op.getOperand(1);
1639   SDValue Jump  = Op.getOperand(2);
1640
1641   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1642                      Chain, Jump, Cond);
1643 }
1644
1645 /// XXX Only kernel functions are supported, so we can assume for now that
1646 /// every function is a kernel function, but in the future we should use
1647 /// separate calling conventions for kernel and non-kernel functions.
1648 SDValue R600TargetLowering::LowerFormalArguments(
1649                                       SDValue Chain,
1650                                       CallingConv::ID CallConv,
1651                                       bool isVarArg,
1652                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1653                                       SDLoc DL, SelectionDAG &DAG,
1654                                       SmallVectorImpl<SDValue> &InVals) const {
1655   SmallVector<CCValAssign, 16> ArgLocs;
1656   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1657                  *DAG.getContext());
1658   MachineFunction &MF = DAG.getMachineFunction();
1659   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1660
1661   SmallVector<ISD::InputArg, 8> LocalIns;
1662
1663   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1664
1665   AnalyzeFormalArguments(CCInfo, LocalIns);
1666
1667   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1668     CCValAssign &VA = ArgLocs[i];
1669     const ISD::InputArg &In = Ins[i];
1670     EVT VT = In.VT;
1671     EVT MemVT = VA.getLocVT();
1672     if (!VT.isVector() && MemVT.isVector()) {
1673       // Get load source type if scalarized.
1674       MemVT = MemVT.getVectorElementType();
1675     }
1676
1677     if (MFI->getShaderType() != ShaderType::COMPUTE) {
1678       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1679       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1680       InVals.push_back(Register);
1681       continue;
1682     }
1683
1684     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1685                                           AMDGPUAS::CONSTANT_BUFFER_0);
1686
1687     // i64 isn't a legal type, so the register type used ends up as i32, which
1688     // isn't expected here. It attempts to create this sextload, but it ends up
1689     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1690     // for <1 x i64>.
1691
1692     // The first 36 bytes of the input buffer contains information about
1693     // thread group and global sizes.
1694     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1695     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1696       // FIXME: This should really check the extload type, but the handling of
1697       // extload vector parameters seems to be broken.
1698
1699       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1700       Ext = ISD::SEXTLOAD;
1701     }
1702
1703     // Compute the offset from the value.
1704     // XXX - I think PartOffset should give you this, but it seems to give the
1705     // size of the register which isn't useful.
1706
1707     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1708     unsigned PartOffset = VA.getLocMemOffset();
1709     unsigned Offset = 36 + VA.getLocMemOffset();
1710
1711     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1712     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1713                               DAG.getConstant(Offset, DL, MVT::i32),
1714                               DAG.getUNDEF(MVT::i32),
1715                               PtrInfo,
1716                               MemVT, false, true, true, 4);
1717
1718     // 4 is the preferred alignment for the CONSTANT memory space.
1719     InVals.push_back(Arg);
1720     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1721   }
1722   return Chain;
1723 }
1724
1725 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1726    if (!VT.isVector())
1727      return MVT::i32;
1728    return VT.changeVectorElementTypeToInteger();
1729 }
1730
1731 static SDValue CompactSwizzlableVector(
1732   SelectionDAG &DAG, SDValue VectorEntry,
1733   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1734   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1735   assert(RemapSwizzle.empty());
1736   SDValue NewBldVec[4] = {
1737     VectorEntry.getOperand(0),
1738     VectorEntry.getOperand(1),
1739     VectorEntry.getOperand(2),
1740     VectorEntry.getOperand(3)
1741   };
1742
1743   for (unsigned i = 0; i < 4; i++) {
1744     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1745       // We mask write here to teach later passes that the ith element of this
1746       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1747       // break false dependencies and additionnaly make assembly easier to read.
1748       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1749     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1750       if (C->isZero()) {
1751         RemapSwizzle[i] = 4; // SEL_0
1752         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1753       } else if (C->isExactlyValue(1.0)) {
1754         RemapSwizzle[i] = 5; // SEL_1
1755         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1756       }
1757     }
1758
1759     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1760       continue;
1761     for (unsigned j = 0; j < i; j++) {
1762       if (NewBldVec[i] == NewBldVec[j]) {
1763         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1764         RemapSwizzle[i] = j;
1765         break;
1766       }
1767     }
1768   }
1769
1770   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1771                      VectorEntry.getValueType(), NewBldVec);
1772 }
1773
1774 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1775                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1776   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1777   assert(RemapSwizzle.empty());
1778   SDValue NewBldVec[4] = {
1779       VectorEntry.getOperand(0),
1780       VectorEntry.getOperand(1),
1781       VectorEntry.getOperand(2),
1782       VectorEntry.getOperand(3)
1783   };
1784   bool isUnmovable[4] = { false, false, false, false };
1785   for (unsigned i = 0; i < 4; i++) {
1786     RemapSwizzle[i] = i;
1787     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1788       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1789           ->getZExtValue();
1790       if (i == Idx)
1791         isUnmovable[Idx] = true;
1792     }
1793   }
1794
1795   for (unsigned i = 0; i < 4; i++) {
1796     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1797       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1798           ->getZExtValue();
1799       if (isUnmovable[Idx])
1800         continue;
1801       // Swap i and Idx
1802       std::swap(NewBldVec[Idx], NewBldVec[i]);
1803       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1804       break;
1805     }
1806   }
1807
1808   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1809                      VectorEntry.getValueType(), NewBldVec);
1810 }
1811
1812
1813 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1814                                             SDValue Swz[4], SelectionDAG &DAG,
1815                                             SDLoc DL) const {
1816   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1817   // Old -> New swizzle values
1818   DenseMap<unsigned, unsigned> SwizzleRemap;
1819
1820   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1821   for (unsigned i = 0; i < 4; i++) {
1822     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1823     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1824       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1825   }
1826
1827   SwizzleRemap.clear();
1828   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1829   for (unsigned i = 0; i < 4; i++) {
1830     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1831     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1832       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1833   }
1834
1835   return BuildVector;
1836 }
1837
1838
1839 //===----------------------------------------------------------------------===//
1840 // Custom DAG Optimizations
1841 //===----------------------------------------------------------------------===//
1842
1843 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1844                                               DAGCombinerInfo &DCI) const {
1845   SelectionDAG &DAG = DCI.DAG;
1846
1847   switch (N->getOpcode()) {
1848   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1849   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1850   case ISD::FP_ROUND: {
1851       SDValue Arg = N->getOperand(0);
1852       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1853         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1854                            Arg.getOperand(0));
1855       }
1856       break;
1857     }
1858
1859   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1860   // (i32 select_cc f32, f32, -1, 0 cc)
1861   //
1862   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1863   // this to one of the SET*_DX10 instructions.
1864   case ISD::FP_TO_SINT: {
1865     SDValue FNeg = N->getOperand(0);
1866     if (FNeg.getOpcode() != ISD::FNEG) {
1867       return SDValue();
1868     }
1869     SDValue SelectCC = FNeg.getOperand(0);
1870     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1871         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1872         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1873         !isHWTrueValue(SelectCC.getOperand(2)) ||
1874         !isHWFalseValue(SelectCC.getOperand(3))) {
1875       return SDValue();
1876     }
1877
1878     SDLoc dl(N);
1879     return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1880                            SelectCC.getOperand(0), // LHS
1881                            SelectCC.getOperand(1), // RHS
1882                            DAG.getConstant(-1, dl, MVT::i32), // True
1883                            DAG.getConstant(0, dl, MVT::i32),  // False
1884                            SelectCC.getOperand(4)); // CC
1885
1886     break;
1887   }
1888
1889   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1890   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1891   case ISD::INSERT_VECTOR_ELT: {
1892     SDValue InVec = N->getOperand(0);
1893     SDValue InVal = N->getOperand(1);
1894     SDValue EltNo = N->getOperand(2);
1895     SDLoc dl(N);
1896
1897     // If the inserted element is an UNDEF, just use the input vector.
1898     if (InVal.getOpcode() == ISD::UNDEF)
1899       return InVec;
1900
1901     EVT VT = InVec.getValueType();
1902
1903     // If we can't generate a legal BUILD_VECTOR, exit
1904     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1905       return SDValue();
1906
1907     // Check that we know which element is being inserted
1908     if (!isa<ConstantSDNode>(EltNo))
1909       return SDValue();
1910     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1911
1912     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1913     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1914     // vector elements.
1915     SmallVector<SDValue, 8> Ops;
1916     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1917       Ops.append(InVec.getNode()->op_begin(),
1918                  InVec.getNode()->op_end());
1919     } else if (InVec.getOpcode() == ISD::UNDEF) {
1920       unsigned NElts = VT.getVectorNumElements();
1921       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1922     } else {
1923       return SDValue();
1924     }
1925
1926     // Insert the element
1927     if (Elt < Ops.size()) {
1928       // All the operands of BUILD_VECTOR must have the same type;
1929       // we enforce that here.
1930       EVT OpVT = Ops[0].getValueType();
1931       if (InVal.getValueType() != OpVT)
1932         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1933           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1934           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1935       Ops[Elt] = InVal;
1936     }
1937
1938     // Return the new vector
1939     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1940   }
1941
1942   // Extract_vec (Build_vector) generated by custom lowering
1943   // also needs to be customly combined
1944   case ISD::EXTRACT_VECTOR_ELT: {
1945     SDValue Arg = N->getOperand(0);
1946     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1947       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1948         unsigned Element = Const->getZExtValue();
1949         return Arg->getOperand(Element);
1950       }
1951     }
1952     if (Arg.getOpcode() == ISD::BITCAST &&
1953         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1954       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1955         unsigned Element = Const->getZExtValue();
1956         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1957             Arg->getOperand(0).getOperand(Element));
1958       }
1959     }
1960   }
1961
1962   case ISD::SELECT_CC: {
1963     // Try common optimizations
1964     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1965     if (Ret.getNode())
1966       return Ret;
1967
1968     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1969     //      selectcc x, y, a, b, inv(cc)
1970     //
1971     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1972     //      selectcc x, y, a, b, cc
1973     SDValue LHS = N->getOperand(0);
1974     if (LHS.getOpcode() != ISD::SELECT_CC) {
1975       return SDValue();
1976     }
1977
1978     SDValue RHS = N->getOperand(1);
1979     SDValue True = N->getOperand(2);
1980     SDValue False = N->getOperand(3);
1981     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1982
1983     if (LHS.getOperand(2).getNode() != True.getNode() ||
1984         LHS.getOperand(3).getNode() != False.getNode() ||
1985         RHS.getNode() != False.getNode()) {
1986       return SDValue();
1987     }
1988
1989     switch (NCC) {
1990     default: return SDValue();
1991     case ISD::SETNE: return LHS;
1992     case ISD::SETEQ: {
1993       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1994       LHSCC = ISD::getSetCCInverse(LHSCC,
1995                                   LHS.getOperand(0).getValueType().isInteger());
1996       if (DCI.isBeforeLegalizeOps() ||
1997           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1998         return DAG.getSelectCC(SDLoc(N),
1999                                LHS.getOperand(0),
2000                                LHS.getOperand(1),
2001                                LHS.getOperand(2),
2002                                LHS.getOperand(3),
2003                                LHSCC);
2004       break;
2005     }
2006     }
2007     return SDValue();
2008   }
2009
2010   case AMDGPUISD::EXPORT: {
2011     SDValue Arg = N->getOperand(1);
2012     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2013       break;
2014
2015     SDValue NewArgs[8] = {
2016       N->getOperand(0), // Chain
2017       SDValue(),
2018       N->getOperand(2), // ArrayBase
2019       N->getOperand(3), // Type
2020       N->getOperand(4), // SWZ_X
2021       N->getOperand(5), // SWZ_Y
2022       N->getOperand(6), // SWZ_Z
2023       N->getOperand(7) // SWZ_W
2024     };
2025     SDLoc DL(N);
2026     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2027     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2028   }
2029   case AMDGPUISD::TEXTURE_FETCH: {
2030     SDValue Arg = N->getOperand(1);
2031     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2032       break;
2033
2034     SDValue NewArgs[19] = {
2035       N->getOperand(0),
2036       N->getOperand(1),
2037       N->getOperand(2),
2038       N->getOperand(3),
2039       N->getOperand(4),
2040       N->getOperand(5),
2041       N->getOperand(6),
2042       N->getOperand(7),
2043       N->getOperand(8),
2044       N->getOperand(9),
2045       N->getOperand(10),
2046       N->getOperand(11),
2047       N->getOperand(12),
2048       N->getOperand(13),
2049       N->getOperand(14),
2050       N->getOperand(15),
2051       N->getOperand(16),
2052       N->getOperand(17),
2053       N->getOperand(18),
2054     };
2055     SDLoc DL(N);
2056     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2057     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2058   }
2059   }
2060
2061   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2062 }
2063
2064 static bool
2065 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2066             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2067   const R600InstrInfo *TII =
2068       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2069   if (!Src.isMachineOpcode())
2070     return false;
2071   switch (Src.getMachineOpcode()) {
2072   case AMDGPU::FNEG_R600:
2073     if (!Neg.getNode())
2074       return false;
2075     Src = Src.getOperand(0);
2076     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2077     return true;
2078   case AMDGPU::FABS_R600:
2079     if (!Abs.getNode())
2080       return false;
2081     Src = Src.getOperand(0);
2082     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2083     return true;
2084   case AMDGPU::CONST_COPY: {
2085     unsigned Opcode = ParentNode->getMachineOpcode();
2086     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2087
2088     if (!Sel.getNode())
2089       return false;
2090
2091     SDValue CstOffset = Src.getOperand(0);
2092     if (ParentNode->getValueType(0).isVector())
2093       return false;
2094
2095     // Gather constants values
2096     int SrcIndices[] = {
2097       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2098       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2099       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2100       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2101       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2102       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2103       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2104       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2105       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2106       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2107       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2108     };
2109     std::vector<unsigned> Consts;
2110     for (int OtherSrcIdx : SrcIndices) {
2111       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2112       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2113         continue;
2114       if (HasDst) {
2115         OtherSrcIdx--;
2116         OtherSelIdx--;
2117       }
2118       if (RegisterSDNode *Reg =
2119           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2120         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2121           ConstantSDNode *Cst
2122             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2123           Consts.push_back(Cst->getZExtValue());
2124         }
2125       }
2126     }
2127
2128     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2129     Consts.push_back(Cst->getZExtValue());
2130     if (!TII->fitsConstReadLimitations(Consts)) {
2131       return false;
2132     }
2133
2134     Sel = CstOffset;
2135     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2136     return true;
2137   }
2138   case AMDGPU::MOV_IMM_I32:
2139   case AMDGPU::MOV_IMM_F32: {
2140     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2141     uint64_t ImmValue = 0;
2142
2143
2144     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2145       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2146       float FloatValue = FPC->getValueAPF().convertToFloat();
2147       if (FloatValue == 0.0) {
2148         ImmReg = AMDGPU::ZERO;
2149       } else if (FloatValue == 0.5) {
2150         ImmReg = AMDGPU::HALF;
2151       } else if (FloatValue == 1.0) {
2152         ImmReg = AMDGPU::ONE;
2153       } else {
2154         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2155       }
2156     } else {
2157       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2158       uint64_t Value = C->getZExtValue();
2159       if (Value == 0) {
2160         ImmReg = AMDGPU::ZERO;
2161       } else if (Value == 1) {
2162         ImmReg = AMDGPU::ONE_INT;
2163       } else {
2164         ImmValue = Value;
2165       }
2166     }
2167
2168     // Check that we aren't already using an immediate.
2169     // XXX: It's possible for an instruction to have more than one
2170     // immediate operand, but this is not supported yet.
2171     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2172       if (!Imm.getNode())
2173         return false;
2174       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2175       assert(C);
2176       if (C->getZExtValue())
2177         return false;
2178       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2179     }
2180     Src = DAG.getRegister(ImmReg, MVT::i32);
2181     return true;
2182   }
2183   default:
2184     return false;
2185   }
2186 }
2187
2188
2189 /// \brief Fold the instructions after selecting them
2190 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2191                                             SelectionDAG &DAG) const {
2192   const R600InstrInfo *TII =
2193       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2194   if (!Node->isMachineOpcode())
2195     return Node;
2196   unsigned Opcode = Node->getMachineOpcode();
2197   SDValue FakeOp;
2198
2199   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2200
2201   if (Opcode == AMDGPU::DOT_4) {
2202     int OperandIdx[] = {
2203       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2204       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2211         };
2212     int NegIdx[] = {
2213       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2214       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2215       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2216       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2217       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2218       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2219       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2220       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2221     };
2222     int AbsIdx[] = {
2223       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2224       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2225       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2226       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2227       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2228       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2229       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2230       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2231     };
2232     for (unsigned i = 0; i < 8; i++) {
2233       if (OperandIdx[i] < 0)
2234         return Node;
2235       SDValue &Src = Ops[OperandIdx[i] - 1];
2236       SDValue &Neg = Ops[NegIdx[i] - 1];
2237       SDValue &Abs = Ops[AbsIdx[i] - 1];
2238       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2239       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2240       if (HasDst)
2241         SelIdx--;
2242       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2243       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2244         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2245     }
2246   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2247     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2248       SDValue &Src = Ops[i];
2249       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2250         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2251     }
2252   } else if (Opcode == AMDGPU::CLAMP_R600) {
2253     SDValue Src = Node->getOperand(0);
2254     if (!Src.isMachineOpcode() ||
2255         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2256       return Node;
2257     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2258         AMDGPU::OpName::clamp);
2259     if (ClampIdx < 0)
2260       return Node;
2261     SDLoc DL(Node);
2262     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2263     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2264     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2265                               Node->getVTList(), Ops);
2266   } else {
2267     if (!TII->hasInstrModifiers(Opcode))
2268       return Node;
2269     int OperandIdx[] = {
2270       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2271       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2272       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2273     };
2274     int NegIdx[] = {
2275       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2276       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2277       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2278     };
2279     int AbsIdx[] = {
2280       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2281       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2282       -1
2283     };
2284     for (unsigned i = 0; i < 3; i++) {
2285       if (OperandIdx[i] < 0)
2286         return Node;
2287       SDValue &Src = Ops[OperandIdx[i] - 1];
2288       SDValue &Neg = Ops[NegIdx[i] - 1];
2289       SDValue FakeAbs;
2290       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2291       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2292       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2293       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2294       if (HasDst) {
2295         SelIdx--;
2296         ImmIdx--;
2297       }
2298       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2299       SDValue &Imm = Ops[ImmIdx];
2300       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2301         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2302     }
2303   }
2304
2305   return Node;
2306 }