lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  34     AMDGPUTargetLowering(TM),
  35     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42
  43   computeRegisterProperties();
  44
  45   // Set condition code actions
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // Expand sign extension of vectors
  95   if (!Subtarget->hasBFE())
  96     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  97
  98   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 100
 101   if (!Subtarget->hasBFE())
 102     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 103   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 105
 106   if (!Subtarget->hasBFE())
 107     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 110
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 113   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 114
 115   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 116
 117
 118   // Legalize loads and stores to the private address space.
 119   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 120   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 121   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 122
 123   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 124   // spaces, so it is custom lowered to handle those where it isn't.
 125   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
 126   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
 127   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
 128   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
 129   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
 130   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
 131
 132   setOperationAction(ISD::STORE, MVT::i8, Custom);
 133   setOperationAction(ISD::STORE, MVT::i32, Custom);
 134   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 135   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 136   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 137   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 138
 139   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 140   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 141   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 142
 143   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 144   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 145   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 146   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 147
 148   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 149   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 150   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 151   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 152
 153   setTargetDAGCombine(ISD::FP_ROUND);
 154   setTargetDAGCombine(ISD::FP_TO_SINT);
 155   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 156   setTargetDAGCombine(ISD::SELECT_CC);
 157   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 158
 159   setOperationAction(ISD::SUB, MVT::i64, Expand);
 160
 161   // These should be replaced by UDVIREM, but it does not happen automatically
 162   // during Type Legalization
 163   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 164   setOperationAction(ISD::UREM, MVT::i64, Custom);
 165   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 166   setOperationAction(ISD::SREM, MVT::i64, Custom);
 167
 168   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 169   //  to be Legal/Custom in order to avoid library calls.
 170   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 171   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 172   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 173
 174   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 175
 176   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 177   for (MVT VT : ScalarIntVTs) {
 178     setOperationAction(ISD::ADDC, VT, Expand);
 179     setOperationAction(ISD::SUBC, VT, Expand);
 180     setOperationAction(ISD::ADDE, VT, Expand);
 181     setOperationAction(ISD::SUBE, VT, Expand);
 182   }
 183
 184   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 185   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 186   setSchedulingPreference(Sched::Source);
 187 }
 188
 189 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 190     MachineInstr * MI, MachineBasicBlock * BB) const {
 191   MachineFunction * MF = BB->getParent();
 192   MachineRegisterInfo &MRI = MF->getRegInfo();
 193   MachineBasicBlock::iterator I = *MI;
 194   const R600InstrInfo *TII =
 195       static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());
 196
 197   switch (MI->getOpcode()) {
 198   default:
 199     // Replace LDS_*_RET instruction that don't have any uses with the
 200     // equivalent LDS_*_NORET instruction.
 201     if (TII->isLDSRetInstr(MI->getOpcode())) {
 202       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 203       assert(DstIdx != -1);
 204       MachineInstrBuilder NewMI;
 205       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 206       //        LDS_1A2D support and remove this special case.
 207       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
 208            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
 209         return BB;
 210
 211       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 212                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 213       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 214         NewMI.addOperand(MI->getOperand(i));
 215       }
 216     } else {
 217       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 218     }
 219     break;
 220   case AMDGPU::CLAMP_R600: {
 221     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 222                                                    AMDGPU::MOV,
 223                                                    MI->getOperand(0).getReg(),
 224                                                    MI->getOperand(1).getReg());
 225     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 226     break;
 227   }
 228
 229   case AMDGPU::FABS_R600: {
 230     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 231                                                     AMDGPU::MOV,
 232                                                     MI->getOperand(0).getReg(),
 233                                                     MI->getOperand(1).getReg());
 234     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 235     break;
 236   }
 237
 238   case AMDGPU::FNEG_R600: {
 239     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 240                                                     AMDGPU::MOV,
 241                                                     MI->getOperand(0).getReg(),
 242                                                     MI->getOperand(1).getReg());
 243     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 244     break;
 245   }
 246
 247   case AMDGPU::MASK_WRITE: {
 248     unsigned maskedRegister = MI->getOperand(0).getReg();
 249     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 250     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 251     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 252     break;
 253   }
 254
 255   case AMDGPU::MOV_IMM_F32:
 256     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 257                      MI->getOperand(1).getFPImm()->getValueAPF()
 258                          .bitcastToAPInt().getZExtValue());
 259     break;
 260   case AMDGPU::MOV_IMM_I32:
 261     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 262                      MI->getOperand(1).getImm());
 263     break;
 264   case AMDGPU::CONST_COPY: {
 265     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 266         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 267     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 268         MI->getOperand(1).getImm());
 269     break;
 270   }
 271
 272   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 273   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 274   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 275     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 276
 277     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 278             .addOperand(MI->getOperand(0))
 279             .addOperand(MI->getOperand(1))
 280             .addImm(EOP); // Set End of program bit
 281     break;
 282   }
 283
 284   case AMDGPU::TXD: {
 285     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 286     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 287     MachineOperand &RID = MI->getOperand(4);
 288     MachineOperand &SID = MI->getOperand(5);
 289     unsigned TextureId = MI->getOperand(6).getImm();
 290     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 291     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 292
 293     switch (TextureId) {
 294     case 5: // Rect
 295       CTX = CTY = 0;
 296       break;
 297     case 6: // Shadow1D
 298       SrcW = SrcZ;
 299       break;
 300     case 7: // Shadow2D
 301       SrcW = SrcZ;
 302       break;
 303     case 8: // ShadowRect
 304       CTX = CTY = 0;
 305       SrcW = SrcZ;
 306       break;
 307     case 9: // 1DArray
 308       SrcZ = SrcY;
 309       CTZ = 0;
 310       break;
 311     case 10: // 2DArray
 312       CTZ = 0;
 313       break;
 314     case 11: // Shadow1DArray
 315       SrcZ = SrcY;
 316       CTZ = 0;
 317       break;
 318     case 12: // Shadow2DArray
 319       CTZ = 0;
 320       break;
 321     }
 322     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 323             .addOperand(MI->getOperand(3))
 324             .addImm(SrcX)
 325             .addImm(SrcY)
 326             .addImm(SrcZ)
 327             .addImm(SrcW)
 328             .addImm(0)
 329             .addImm(0)
 330             .addImm(0)
 331             .addImm(0)
 332             .addImm(1)
 333             .addImm(2)
 334             .addImm(3)
 335             .addOperand(RID)
 336             .addOperand(SID)
 337             .addImm(CTX)
 338             .addImm(CTY)
 339             .addImm(CTZ)
 340             .addImm(CTW);
 341     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 342             .addOperand(MI->getOperand(2))
 343             .addImm(SrcX)
 344             .addImm(SrcY)
 345             .addImm(SrcZ)
 346             .addImm(SrcW)
 347             .addImm(0)
 348             .addImm(0)
 349             .addImm(0)
 350             .addImm(0)
 351             .addImm(1)
 352             .addImm(2)
 353             .addImm(3)
 354             .addOperand(RID)
 355             .addOperand(SID)
 356             .addImm(CTX)
 357             .addImm(CTY)
 358             .addImm(CTZ)
 359             .addImm(CTW);
 360     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 361             .addOperand(MI->getOperand(0))
 362             .addOperand(MI->getOperand(1))
 363             .addImm(SrcX)
 364             .addImm(SrcY)
 365             .addImm(SrcZ)
 366             .addImm(SrcW)
 367             .addImm(0)
 368             .addImm(0)
 369             .addImm(0)
 370             .addImm(0)
 371             .addImm(1)
 372             .addImm(2)
 373             .addImm(3)
 374             .addOperand(RID)
 375             .addOperand(SID)
 376             .addImm(CTX)
 377             .addImm(CTY)
 378             .addImm(CTZ)
 379             .addImm(CTW)
 380             .addReg(T0, RegState::Implicit)
 381             .addReg(T1, RegState::Implicit);
 382     break;
 383   }
 384
 385   case AMDGPU::TXD_SHADOW: {
 386     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 387     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 388     MachineOperand &RID = MI->getOperand(4);
 389     MachineOperand &SID = MI->getOperand(5);
 390     unsigned TextureId = MI->getOperand(6).getImm();
 391     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 392     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 393
 394     switch (TextureId) {
 395     case 5: // Rect
 396       CTX = CTY = 0;
 397       break;
 398     case 6: // Shadow1D
 399       SrcW = SrcZ;
 400       break;
 401     case 7: // Shadow2D
 402       SrcW = SrcZ;
 403       break;
 404     case 8: // ShadowRect
 405       CTX = CTY = 0;
 406       SrcW = SrcZ;
 407       break;
 408     case 9: // 1DArray
 409       SrcZ = SrcY;
 410       CTZ = 0;
 411       break;
 412     case 10: // 2DArray
 413       CTZ = 0;
 414       break;
 415     case 11: // Shadow1DArray
 416       SrcZ = SrcY;
 417       CTZ = 0;
 418       break;
 419     case 12: // Shadow2DArray
 420       CTZ = 0;
 421       break;
 422     }
 423
 424     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 425             .addOperand(MI->getOperand(3))
 426             .addImm(SrcX)
 427             .addImm(SrcY)
 428             .addImm(SrcZ)
 429             .addImm(SrcW)
 430             .addImm(0)
 431             .addImm(0)
 432             .addImm(0)
 433             .addImm(0)
 434             .addImm(1)
 435             .addImm(2)
 436             .addImm(3)
 437             .addOperand(RID)
 438             .addOperand(SID)
 439             .addImm(CTX)
 440             .addImm(CTY)
 441             .addImm(CTZ)
 442             .addImm(CTW);
 443     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 444             .addOperand(MI->getOperand(2))
 445             .addImm(SrcX)
 446             .addImm(SrcY)
 447             .addImm(SrcZ)
 448             .addImm(SrcW)
 449             .addImm(0)
 450             .addImm(0)
 451             .addImm(0)
 452             .addImm(0)
 453             .addImm(1)
 454             .addImm(2)
 455             .addImm(3)
 456             .addOperand(RID)
 457             .addOperand(SID)
 458             .addImm(CTX)
 459             .addImm(CTY)
 460             .addImm(CTZ)
 461             .addImm(CTW);
 462     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 463             .addOperand(MI->getOperand(0))
 464             .addOperand(MI->getOperand(1))
 465             .addImm(SrcX)
 466             .addImm(SrcY)
 467             .addImm(SrcZ)
 468             .addImm(SrcW)
 469             .addImm(0)
 470             .addImm(0)
 471             .addImm(0)
 472             .addImm(0)
 473             .addImm(1)
 474             .addImm(2)
 475             .addImm(3)
 476             .addOperand(RID)
 477             .addOperand(SID)
 478             .addImm(CTX)
 479             .addImm(CTY)
 480             .addImm(CTZ)
 481             .addImm(CTW)
 482             .addReg(T0, RegState::Implicit)
 483             .addReg(T1, RegState::Implicit);
 484     break;
 485   }
 486
 487   case AMDGPU::BRANCH:
 488       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 489               .addOperand(MI->getOperand(0));
 490       break;
 491
 492   case AMDGPU::BRANCH_COND_f32: {
 493     MachineInstr *NewMI =
 494       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 495               AMDGPU::PREDICATE_BIT)
 496               .addOperand(MI->getOperand(1))
 497               .addImm(OPCODE_IS_NOT_ZERO)
 498               .addImm(0); // Flags
 499     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 500     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 501             .addOperand(MI->getOperand(0))
 502             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 503     break;
 504   }
 505
 506   case AMDGPU::BRANCH_COND_i32: {
 507     MachineInstr *NewMI =
 508       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 509             AMDGPU::PREDICATE_BIT)
 510             .addOperand(MI->getOperand(1))
 511             .addImm(OPCODE_IS_NOT_ZERO_INT)
 512             .addImm(0); // Flags
 513     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 514     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 515            .addOperand(MI->getOperand(0))
 516             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 517     break;
 518   }
 519
 520   case AMDGPU::EG_ExportSwz:
 521   case AMDGPU::R600_ExportSwz: {
 522     // Instruction is left unmodified if its not the last one of its type
 523     bool isLastInstructionOfItsType = true;
 524     unsigned InstExportType = MI->getOperand(1).getImm();
 525     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 526          EndBlock = BB->end(); NextExportInst != EndBlock;
 527          NextExportInst = std::next(NextExportInst)) {
 528       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 529           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 530         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 531             .getImm();
 532         if (CurrentInstExportType == InstExportType) {
 533           isLastInstructionOfItsType = false;
 534           break;
 535         }
 536       }
 537     }
 538     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 539     if (!EOP && !isLastInstructionOfItsType)
 540       return BB;
 541     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 542     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 543             .addOperand(MI->getOperand(0))
 544             .addOperand(MI->getOperand(1))
 545             .addOperand(MI->getOperand(2))
 546             .addOperand(MI->getOperand(3))
 547             .addOperand(MI->getOperand(4))
 548             .addOperand(MI->getOperand(5))
 549             .addOperand(MI->getOperand(6))
 550             .addImm(CfInst)
 551             .addImm(EOP);
 552     break;
 553   }
 554   case AMDGPU::RETURN: {
 555     // RETURN instructions must have the live-out registers as implicit uses,
 556     // otherwise they appear dead.
 557     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 558     MachineInstrBuilder MIB(*MF, MI);
 559     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 560       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 561     return BB;
 562   }
 563   }
 564
 565   MI->eraseFromParent();
 566   return BB;
 567 }
 568
 569 //===----------------------------------------------------------------------===//
 570 // Custom DAG Lowering Operations
 571 //===----------------------------------------------------------------------===//
 572
 573 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 574   MachineFunction &MF = DAG.getMachineFunction();
 575   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 576   switch (Op.getOpcode()) {
 577   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 578   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 579   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 580   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 581   case ISD::SRA_PARTS:
 582   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 583   case ISD::FCOS:
 584   case ISD::FSIN: return LowerTrig(Op, DAG);
 585   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 586   case ISD::STORE: return LowerSTORE(Op, DAG);
 587   case ISD::LOAD: {
 588     SDValue Result = LowerLOAD(Op, DAG);
 589     assert((!Result.getNode() ||
 590             Result.getNode()->getNumValues() == 2) &&
 591            "Load should return a value and a chain");
 592     return Result;
 593   }
 594
 595   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 596   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 597   case ISD::INTRINSIC_VOID: {
 598     SDValue Chain = Op.getOperand(0);
 599     unsigned IntrinsicID =
 600                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 601     switch (IntrinsicID) {
 602     case AMDGPUIntrinsic::AMDGPU_store_output: {
 603       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 604       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 605       MFI->LiveOuts.push_back(Reg);
 606       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 607     }
 608     case AMDGPUIntrinsic::R600_store_swizzle: {
 609       const SDValue Args[8] = {
 610         Chain,
 611         Op.getOperand(2), // Export Value
 612         Op.getOperand(3), // ArrayBase
 613         Op.getOperand(4), // Type
 614         DAG.getConstant(0, MVT::i32), // SWZ_X
 615         DAG.getConstant(1, MVT::i32), // SWZ_Y
 616         DAG.getConstant(2, MVT::i32), // SWZ_Z
 617         DAG.getConstant(3, MVT::i32) // SWZ_W
 618       };
 619       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 620     }
 621
 622     // default for switch(IntrinsicID)
 623     default: break;
 624     }
 625     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 626     break;
 627   }
 628   case ISD::INTRINSIC_WO_CHAIN: {
 629     unsigned IntrinsicID =
 630                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 631     EVT VT = Op.getValueType();
 632     SDLoc DL(Op);
 633     switch(IntrinsicID) {
 634     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 635     case AMDGPUIntrinsic::R600_load_input: {
 636       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 637       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 638       MachineFunction &MF = DAG.getMachineFunction();
 639       MachineRegisterInfo &MRI = MF.getRegInfo();
 640       MRI.addLiveIn(Reg);
 641       return DAG.getCopyFromReg(DAG.getEntryNode(),
 642           SDLoc(DAG.getEntryNode()), Reg, VT);
 643     }
 644
 645     case AMDGPUIntrinsic::R600_interp_input: {
 646       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 647       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 648       MachineSDNode *interp;
 649       if (ijb < 0) {
 650         const MachineFunction &MF = DAG.getMachineFunction();
 651         const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
 652             MF.getSubtarget().getInstrInfo());
 653         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 654             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 655         return DAG.getTargetExtractSubreg(
 656             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 657             DL, MVT::f32, SDValue(interp, 0));
 658       }
 659       MachineFunction &MF = DAG.getMachineFunction();
 660       MachineRegisterInfo &MRI = MF.getRegInfo();
 661       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 662       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 663       MRI.addLiveIn(RegisterI);
 664       MRI.addLiveIn(RegisterJ);
 665       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 666           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 667       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 668           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 669
 670       if (slot % 4 < 2)
 671         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 672             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 673             RegisterJNode, RegisterINode);
 674       else
 675         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 676             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 677             RegisterJNode, RegisterINode);
 678       return SDValue(interp, slot % 2);
 679     }
 680     case AMDGPUIntrinsic::R600_interp_xy:
 681     case AMDGPUIntrinsic::R600_interp_zw: {
 682       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 683       MachineSDNode *interp;
 684       SDValue RegisterINode = Op.getOperand(2);
 685       SDValue RegisterJNode = Op.getOperand(3);
 686
 687       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 688         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 689             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 690             RegisterJNode, RegisterINode);
 691       else
 692         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 693             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 694             RegisterJNode, RegisterINode);
 695       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 696           SDValue(interp, 0), SDValue(interp, 1));
 697     }
 698     case AMDGPUIntrinsic::R600_tex:
 699     case AMDGPUIntrinsic::R600_texc:
 700     case AMDGPUIntrinsic::R600_txl:
 701     case AMDGPUIntrinsic::R600_txlc:
 702     case AMDGPUIntrinsic::R600_txb:
 703     case AMDGPUIntrinsic::R600_txbc:
 704     case AMDGPUIntrinsic::R600_txf:
 705     case AMDGPUIntrinsic::R600_txq:
 706     case AMDGPUIntrinsic::R600_ddx:
 707     case AMDGPUIntrinsic::R600_ddy:
 708     case AMDGPUIntrinsic::R600_ldptr: {
 709       unsigned TextureOp;
 710       switch (IntrinsicID) {
 711       case AMDGPUIntrinsic::R600_tex:
 712         TextureOp = 0;
 713         break;
 714       case AMDGPUIntrinsic::R600_texc:
 715         TextureOp = 1;
 716         break;
 717       case AMDGPUIntrinsic::R600_txl:
 718         TextureOp = 2;
 719         break;
 720       case AMDGPUIntrinsic::R600_txlc:
 721         TextureOp = 3;
 722         break;
 723       case AMDGPUIntrinsic::R600_txb:
 724         TextureOp = 4;
 725         break;
 726       case AMDGPUIntrinsic::R600_txbc:
 727         TextureOp = 5;
 728         break;
 729       case AMDGPUIntrinsic::R600_txf:
 730         TextureOp = 6;
 731         break;
 732       case AMDGPUIntrinsic::R600_txq:
 733         TextureOp = 7;
 734         break;
 735       case AMDGPUIntrinsic::R600_ddx:
 736         TextureOp = 8;
 737         break;
 738       case AMDGPUIntrinsic::R600_ddy:
 739         TextureOp = 9;
 740         break;
 741       case AMDGPUIntrinsic::R600_ldptr:
 742         TextureOp = 10;
 743         break;
 744       default:
 745         llvm_unreachable("Unknow Texture Operation");
 746       }
 747
 748       SDValue TexArgs[19] = {
 749         DAG.getConstant(TextureOp, MVT::i32),
 750         Op.getOperand(1),
 751         DAG.getConstant(0, MVT::i32),
 752         DAG.getConstant(1, MVT::i32),
 753         DAG.getConstant(2, MVT::i32),
 754         DAG.getConstant(3, MVT::i32),
 755         Op.getOperand(2),
 756         Op.getOperand(3),
 757         Op.getOperand(4),
 758         DAG.getConstant(0, MVT::i32),
 759         DAG.getConstant(1, MVT::i32),
 760         DAG.getConstant(2, MVT::i32),
 761         DAG.getConstant(3, MVT::i32),
 762         Op.getOperand(5),
 763         Op.getOperand(6),
 764         Op.getOperand(7),
 765         Op.getOperand(8),
 766         Op.getOperand(9),
 767         Op.getOperand(10)
 768       };
 769       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 770     }
 771     case AMDGPUIntrinsic::AMDGPU_dp4: {
 772       SDValue Args[8] = {
 773       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 774           DAG.getConstant(0, MVT::i32)),
 775       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 776           DAG.getConstant(0, MVT::i32)),
 777       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 778           DAG.getConstant(1, MVT::i32)),
 779       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 780           DAG.getConstant(1, MVT::i32)),
 781       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 782           DAG.getConstant(2, MVT::i32)),
 783       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 784           DAG.getConstant(2, MVT::i32)),
 785       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 786           DAG.getConstant(3, MVT::i32)),
 787       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 788           DAG.getConstant(3, MVT::i32))
 789       };
 790       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 791     }
 792
 793     case Intrinsic::r600_read_ngroups_x:
 794       return LowerImplicitParameter(DAG, VT, DL, 0);
 795     case Intrinsic::r600_read_ngroups_y:
 796       return LowerImplicitParameter(DAG, VT, DL, 1);
 797     case Intrinsic::r600_read_ngroups_z:
 798       return LowerImplicitParameter(DAG, VT, DL, 2);
 799     case Intrinsic::r600_read_global_size_x:
 800       return LowerImplicitParameter(DAG, VT, DL, 3);
 801     case Intrinsic::r600_read_global_size_y:
 802       return LowerImplicitParameter(DAG, VT, DL, 4);
 803     case Intrinsic::r600_read_global_size_z:
 804       return LowerImplicitParameter(DAG, VT, DL, 5);
 805     case Intrinsic::r600_read_local_size_x:
 806       return LowerImplicitParameter(DAG, VT, DL, 6);
 807     case Intrinsic::r600_read_local_size_y:
 808       return LowerImplicitParameter(DAG, VT, DL, 7);
 809     case Intrinsic::r600_read_local_size_z:
 810       return LowerImplicitParameter(DAG, VT, DL, 8);
 811
 812     case Intrinsic::AMDGPU_read_workdim:
 813       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
 814
 815     case Intrinsic::r600_read_tgid_x:
 816       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 817                                   AMDGPU::T1_X, VT);
 818     case Intrinsic::r600_read_tgid_y:
 819       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 820                                   AMDGPU::T1_Y, VT);
 821     case Intrinsic::r600_read_tgid_z:
 822       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 823                                   AMDGPU::T1_Z, VT);
 824     case Intrinsic::r600_read_tidig_x:
 825       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 826                                   AMDGPU::T0_X, VT);
 827     case Intrinsic::r600_read_tidig_y:
 828       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 829                                   AMDGPU::T0_Y, VT);
 830     case Intrinsic::r600_read_tidig_z:
 831       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 832                                   AMDGPU::T0_Z, VT);
 833     case Intrinsic::AMDGPU_rsq:
 834       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 835       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 836     }
 837     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 838     break;
 839   }
 840   } // end switch(Op.getOpcode())
 841   return SDValue();
 842 }
 843
 844 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 845                                             SmallVectorImpl<SDValue> &Results,
 846                                             SelectionDAG &DAG) const {
 847   switch (N->getOpcode()) {
 848   default:
 849     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 850     return;
 851   case ISD::FP_TO_UINT:
 852     if (N->getValueType(0) == MVT::i1) {
 853       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 854       return;
 855     }
 856     // Fall-through. Since we don't care about out of bounds values
 857     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 858     // considers some extra cases which are not necessary here.
 859   case ISD::FP_TO_SINT: {
 860     SDValue Result;
 861     if (expandFP_TO_SINT(N, Result, DAG))
 862       Results.push_back(Result);
 863     return;
 864   }
 865   case ISD::UDIV: {
 866     SDValue Op = SDValue(N, 0);
 867     SDLoc DL(Op);
 868     EVT VT = Op.getValueType();
 869     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 870       N->getOperand(0), N->getOperand(1));
 871     Results.push_back(UDIVREM);
 872     break;
 873   }
 874   case ISD::UREM: {
 875     SDValue Op = SDValue(N, 0);
 876     SDLoc DL(Op);
 877     EVT VT = Op.getValueType();
 878     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 879       N->getOperand(0), N->getOperand(1));
 880     Results.push_back(UDIVREM.getValue(1));
 881     break;
 882   }
 883   case ISD::SDIV: {
 884     SDValue Op = SDValue(N, 0);
 885     SDLoc DL(Op);
 886     EVT VT = Op.getValueType();
 887     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 888       N->getOperand(0), N->getOperand(1));
 889     Results.push_back(SDIVREM);
 890     break;
 891   }
 892   case ISD::SREM: {
 893     SDValue Op = SDValue(N, 0);
 894     SDLoc DL(Op);
 895     EVT VT = Op.getValueType();
 896     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 897       N->getOperand(0), N->getOperand(1));
 898     Results.push_back(SDIVREM.getValue(1));
 899     break;
 900   }
 901   case ISD::SDIVREM: {
 902     SDValue Op = SDValue(N, 1);
 903     SDValue RES = LowerSDIVREM(Op, DAG);
 904     Results.push_back(RES);
 905     Results.push_back(RES.getValue(1));
 906     break;
 907   }
 908   case ISD::UDIVREM: {
 909     SDValue Op = SDValue(N, 0);
 910     SDLoc DL(Op);
 911     EVT VT = Op.getValueType();
 912     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
 913
 914     SDValue one = DAG.getConstant(1, HalfVT);
 915     SDValue zero = DAG.getConstant(0, HalfVT);
 916
 917     //HiLo split
 918     SDValue LHS = N->getOperand(0);
 919     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
 920     SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
 921
 922     SDValue RHS = N->getOperand(1);
 923     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
 924     SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
 925
 926     // Get Speculative values
 927     SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
 928     SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
 929
 930     SDValue REM_Hi = zero;
 931     SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
 932
 933     SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
 934     SDValue DIV_Lo = zero;
 935
 936     const unsigned halfBitWidth = HalfVT.getSizeInBits();
 937
 938     for (unsigned i = 0; i < halfBitWidth; ++i) {
 939       SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
 940       // Get Value of high bit
 941       SDValue HBit;
 942       if (halfBitWidth == 32 && Subtarget->hasBFE()) {
 943         HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
 944       } else {
 945         HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
 946         HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
 947       }
 948
 949       SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
 950         DAG.getConstant(halfBitWidth - 1, HalfVT));
 951       REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
 952       REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
 953
 954       REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
 955       REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
 956
 957
 958       SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 959
 960       SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
 961       SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
 962
 963       DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
 964
 965       // Update REM
 966
 967       SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
 968
 969       REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
 970       REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
 971       REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
 972     }
 973
 974     SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 975     SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
 976     Results.push_back(DIV);
 977     Results.push_back(REM);
 978     break;
 979   }
 980   }
 981 }
 982
 983 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 984                                                    SDValue Vector) const {
 985
 986   SDLoc DL(Vector);
 987   EVT VecVT = Vector.getValueType();
 988   EVT EltVT = VecVT.getVectorElementType();
 989   SmallVector<SDValue, 8> Args;
 990
 991   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 992                                                            i != e; ++i) {
 993     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 994                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 995   }
 996
 997   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 998 }
 999
1000 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1001                                                     SelectionDAG &DAG) const {
1002
1003   SDLoc DL(Op);
1004   SDValue Vector = Op.getOperand(0);
1005   SDValue Index = Op.getOperand(1);
1006
1007   if (isa<ConstantSDNode>(Index) ||
1008       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
1009     return Op;
1010
1011   Vector = vectorToVerticalVector(DAG, Vector);
1012   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
1013                      Vector, Index);
1014 }
1015
1016 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
1017                                                    SelectionDAG &DAG) const {
1018   SDLoc DL(Op);
1019   SDValue Vector = Op.getOperand(0);
1020   SDValue Value = Op.getOperand(1);
1021   SDValue Index = Op.getOperand(2);
1022
1023   if (isa<ConstantSDNode>(Index) ||
1024       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
1025     return Op;
1026
1027   Vector = vectorToVerticalVector(DAG, Vector);
1028   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
1029                                Vector, Value, Index);
1030   return vectorToVerticalVector(DAG, Insert);
1031 }
1032
1033 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
1034   // On hw >= R700, COS/SIN input must be between -1. and 1.
1035   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
1036   EVT VT = Op.getValueType();
1037   SDValue Arg = Op.getOperand(0);
1038   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
1039       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
1040         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
1041           DAG.getConstantFP(0.15915494309, MVT::f32)),
1042         DAG.getConstantFP(0.5, MVT::f32)));
1043   unsigned TrigNode;
1044   switch (Op.getOpcode()) {
1045   case ISD::FCOS:
1046     TrigNode = AMDGPUISD::COS_HW;
1047     break;
1048   case ISD::FSIN:
1049     TrigNode = AMDGPUISD::SIN_HW;
1050     break;
1051   default:
1052     llvm_unreachable("Wrong trig opcode");
1053   }
1054   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
1055       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
1056         DAG.getConstantFP(-0.5, MVT::f32)));
1057   if (Gen >= AMDGPUSubtarget::R700)
1058     return TrigVal;
1059   // On R600 hw, COS/SIN input must be between -Pi and Pi.
1060   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
1061       DAG.getConstantFP(3.14159265359, MVT::f32));
1062 }
1063
1064 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1065   SDLoc DL(Op);
1066   EVT VT = Op.getValueType();
1067
1068   SDValue Lo = Op.getOperand(0);
1069   SDValue Hi = Op.getOperand(1);
1070   SDValue Shift = Op.getOperand(2);
1071   SDValue Zero = DAG.getConstant(0, VT);
1072   SDValue One  = DAG.getConstant(1, VT);
1073
1074   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1075   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1076   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1077   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1078
1079   // The dance around Width1 is necessary for 0 special case.
1080   // Without it the CompShift might be 32, producing incorrect results in
1081   // Overflow. So we do the shift in two steps, the alternative is to
1082   // add a conditional to filter the special case.
1083
1084   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1085   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1086
1087   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1088   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1089   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1090
1091   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1092   SDValue LoBig = Zero;
1093
1094   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1095   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1096
1097   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1098 }
1099
1100 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1101   SDLoc DL(Op);
1102   EVT VT = Op.getValueType();
1103
1104   SDValue Lo = Op.getOperand(0);
1105   SDValue Hi = Op.getOperand(1);
1106   SDValue Shift = Op.getOperand(2);
1107   SDValue Zero = DAG.getConstant(0, VT);
1108   SDValue One  = DAG.getConstant(1, VT);
1109
1110   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1111
1112   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1113   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1114   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1115   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1116
1117   // The dance around Width1 is necessary for 0 special case.
1118   // Without it the CompShift might be 32, producing incorrect results in
1119   // Overflow. So we do the shift in two steps, the alternative is to
1120   // add a conditional to filter the special case.
1121
1122   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1123   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1124
1125   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1126   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1127   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1128
1129   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1130   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1131
1132   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1133   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1134
1135   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1136 }
1137
1138 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1139   return DAG.getNode(
1140       ISD::SETCC,
1141       SDLoc(Op),
1142       MVT::i1,
1143       Op, DAG.getConstantFP(0.0f, MVT::f32),
1144       DAG.getCondCode(ISD::SETNE)
1145       );
1146 }
1147
1148 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1149                                                    SDLoc DL,
1150                                                    unsigned DwordOffset) const {
1151   unsigned ByteOffset = DwordOffset * 4;
1152   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1153                                       AMDGPUAS::CONSTANT_BUFFER_0);
1154
1155   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1156   assert(isInt<16>(ByteOffset));
1157
1158   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1159                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1160                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1161                      false, false, false, 0);
1162 }
1163
1164 bool R600TargetLowering::isZero(SDValue Op) const {
1165   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1166     return Cst->isNullValue();
1167   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1168     return CstFP->isZero();
1169   } else {
1170     return false;
1171   }
1172 }
1173
1174 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1175   SDLoc DL(Op);
1176   EVT VT = Op.getValueType();
1177
1178   SDValue LHS = Op.getOperand(0);
1179   SDValue RHS = Op.getOperand(1);
1180   SDValue True = Op.getOperand(2);
1181   SDValue False = Op.getOperand(3);
1182   SDValue CC = Op.getOperand(4);
1183   SDValue Temp;
1184
1185   // LHS and RHS are guaranteed to be the same value type
1186   EVT CompareVT = LHS.getValueType();
1187
1188   // Check if we can lower this to a native operation.
1189
1190   // Try to lower to a SET* instruction:
1191   //
1192   // SET* can match the following patterns:
1193   //
1194   // select_cc f32, f32, -1,  0, cc_supported
1195   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1196   // select_cc i32, i32, -1,  0, cc_supported
1197   //
1198
1199   // Move hardware True/False values to the correct operand.
1200   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1201   ISD::CondCode InverseCC =
1202      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1203   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1204     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1205       std::swap(False, True);
1206       CC = DAG.getCondCode(InverseCC);
1207     } else {
1208       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1209       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1210         std::swap(False, True);
1211         std::swap(LHS, RHS);
1212         CC = DAG.getCondCode(SwapInvCC);
1213       }
1214     }
1215   }
1216
1217   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1218       (CompareVT == VT || VT == MVT::i32)) {
1219     // This can be matched by a SET* instruction.
1220     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1221   }
1222
1223   // Try to lower to a CND* instruction:
1224   //
1225   // CND* can match the following patterns:
1226   //
1227   // select_cc f32, 0.0, f32, f32, cc_supported
1228   // select_cc f32, 0.0, i32, i32, cc_supported
1229   // select_cc i32, 0,   f32, f32, cc_supported
1230   // select_cc i32, 0,   i32, i32, cc_supported
1231   //
1232
1233   // Try to move the zero value to the RHS
1234   if (isZero(LHS)) {
1235     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1236     // Try swapping the operands
1237     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1238     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1239       std::swap(LHS, RHS);
1240       CC = DAG.getCondCode(CCSwapped);
1241     } else {
1242       // Try inverting the conditon and then swapping the operands
1243       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1244       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1245       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1246         std::swap(True, False);
1247         std::swap(LHS, RHS);
1248         CC = DAG.getCondCode(CCSwapped);
1249       }
1250     }
1251   }
1252   if (isZero(RHS)) {
1253     SDValue Cond = LHS;
1254     SDValue Zero = RHS;
1255     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1256     if (CompareVT != VT) {
1257       // Bitcast True / False to the correct types.  This will end up being
1258       // a nop, but it allows us to define only a single pattern in the
1259       // .TD files for each CND* instruction rather than having to have
1260       // one pattern for integer True/False and one for fp True/False
1261       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1262       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1263     }
1264
1265     switch (CCOpcode) {
1266     case ISD::SETONE:
1267     case ISD::SETUNE:
1268     case ISD::SETNE:
1269       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1270       Temp = True;
1271       True = False;
1272       False = Temp;
1273       break;
1274     default:
1275       break;
1276     }
1277     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1278         Cond, Zero,
1279         True, False,
1280         DAG.getCondCode(CCOpcode));
1281     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1282   }
1283
1284   // If we make it this for it means we have no native instructions to handle
1285   // this SELECT_CC, so we must lower it.
1286   SDValue HWTrue, HWFalse;
1287
1288   if (CompareVT == MVT::f32) {
1289     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1290     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1291   } else if (CompareVT == MVT::i32) {
1292     HWTrue = DAG.getConstant(-1, CompareVT);
1293     HWFalse = DAG.getConstant(0, CompareVT);
1294   }
1295   else {
1296     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1297   }
1298
1299   // Lower this unsupported SELECT_CC into a combination of two supported
1300   // SELECT_CC operations.
1301   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1302
1303   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1304       Cond, HWFalse,
1305       True, False,
1306       DAG.getCondCode(ISD::SETNE));
1307 }
1308
1309 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1310 /// convert these pointers to a register index.  Each register holds
1311 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1312 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1313 /// for indirect addressing.
1314 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1315                                                unsigned StackWidth,
1316                                                SelectionDAG &DAG) const {
1317   unsigned SRLPad;
1318   switch(StackWidth) {
1319   case 1:
1320     SRLPad = 2;
1321     break;
1322   case 2:
1323     SRLPad = 3;
1324     break;
1325   case 4:
1326     SRLPad = 4;
1327     break;
1328   default: llvm_unreachable("Invalid stack width");
1329   }
1330
1331   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1332                      DAG.getConstant(SRLPad, MVT::i32));
1333 }
1334
1335 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1336                                          unsigned ElemIdx,
1337                                          unsigned &Channel,
1338                                          unsigned &PtrIncr) const {
1339   switch (StackWidth) {
1340   default:
1341   case 1:
1342     Channel = 0;
1343     if (ElemIdx > 0) {
1344       PtrIncr = 1;
1345     } else {
1346       PtrIncr = 0;
1347     }
1348     break;
1349   case 2:
1350     Channel = ElemIdx % 2;
1351     if (ElemIdx == 2) {
1352       PtrIncr = 1;
1353     } else {
1354       PtrIncr = 0;
1355     }
1356     break;
1357   case 4:
1358     Channel = ElemIdx;
1359     PtrIncr = 0;
1360     break;
1361   }
1362 }
1363
1364 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1365   SDLoc DL(Op);
1366   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1367   SDValue Chain = Op.getOperand(0);
1368   SDValue Value = Op.getOperand(1);
1369   SDValue Ptr = Op.getOperand(2);
1370
1371   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1372   if (Result.getNode()) {
1373     return Result;
1374   }
1375
1376   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1377     if (StoreNode->isTruncatingStore()) {
1378       EVT VT = Value.getValueType();
1379       assert(VT.bitsLE(MVT::i32));
1380       EVT MemVT = StoreNode->getMemoryVT();
1381       SDValue MaskConstant;
1382       if (MemVT == MVT::i8) {
1383         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1384       } else {
1385         assert(MemVT == MVT::i16);
1386         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1387       }
1388       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1389                                       DAG.getConstant(2, MVT::i32));
1390       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1391                                       DAG.getConstant(0x00000003, VT));
1392       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1393       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1394                                    DAG.getConstant(3, VT));
1395       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1396       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1397       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1398       // vector instead.
1399       SDValue Src[4] = {
1400         ShiftedValue,
1401         DAG.getConstant(0, MVT::i32),
1402         DAG.getConstant(0, MVT::i32),
1403         Mask
1404       };
1405       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1406       SDValue Args[3] = { Chain, Input, DWordAddr };
1407       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1408                                      Op->getVTList(), Args, MemVT,
1409                                      StoreNode->getMemOperand());
1410     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1411                Value.getValueType().bitsGE(MVT::i32)) {
1412       // Convert pointer from byte address to dword address.
1413       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1414                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1415                                     Ptr, DAG.getConstant(2, MVT::i32)));
1416
1417       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1418         llvm_unreachable("Truncated and indexed stores not supported yet");
1419       } else {
1420         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1421       }
1422       return Chain;
1423     }
1424   }
1425
1426   EVT ValueVT = Value.getValueType();
1427
1428   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1429     return SDValue();
1430   }
1431
1432   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1433   if (Ret.getNode()) {
1434     return Ret;
1435   }
1436   // Lowering for indirect addressing
1437
1438   const MachineFunction &MF = DAG.getMachineFunction();
1439   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1440       getTargetMachine().getSubtargetImpl()->getFrameLowering());
1441   unsigned StackWidth = TFL->getStackWidth(MF);
1442
1443   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1444
1445   if (ValueVT.isVector()) {
1446     unsigned NumElemVT = ValueVT.getVectorNumElements();
1447     EVT ElemVT = ValueVT.getVectorElementType();
1448     SmallVector<SDValue, 4> Stores(NumElemVT);
1449
1450     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1451                                       "vector width in load");
1452
1453     for (unsigned i = 0; i < NumElemVT; ++i) {
1454       unsigned Channel, PtrIncr;
1455       getStackAddress(StackWidth, i, Channel, PtrIncr);
1456       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1457                         DAG.getConstant(PtrIncr, MVT::i32));
1458       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1459                                  Value, DAG.getConstant(i, MVT::i32));
1460
1461       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1462                               Chain, Elem, Ptr,
1463                               DAG.getTargetConstant(Channel, MVT::i32));
1464     }
1465      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1466    } else {
1467     if (ValueVT == MVT::i8) {
1468       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1469     }
1470     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1471     DAG.getTargetConstant(0, MVT::i32)); // Channel
1472   }
1473
1474   return Chain;
1475 }
1476
1477 // return (512 + (kc_bank << 12)
1478 static int
1479 ConstantAddressBlock(unsigned AddressSpace) {
1480   switch (AddressSpace) {
1481   case AMDGPUAS::CONSTANT_BUFFER_0:
1482     return 512;
1483   case AMDGPUAS::CONSTANT_BUFFER_1:
1484     return 512 + 4096;
1485   case AMDGPUAS::CONSTANT_BUFFER_2:
1486     return 512 + 4096 * 2;
1487   case AMDGPUAS::CONSTANT_BUFFER_3:
1488     return 512 + 4096 * 3;
1489   case AMDGPUAS::CONSTANT_BUFFER_4:
1490     return 512 + 4096 * 4;
1491   case AMDGPUAS::CONSTANT_BUFFER_5:
1492     return 512 + 4096 * 5;
1493   case AMDGPUAS::CONSTANT_BUFFER_6:
1494     return 512 + 4096 * 6;
1495   case AMDGPUAS::CONSTANT_BUFFER_7:
1496     return 512 + 4096 * 7;
1497   case AMDGPUAS::CONSTANT_BUFFER_8:
1498     return 512 + 4096 * 8;
1499   case AMDGPUAS::CONSTANT_BUFFER_9:
1500     return 512 + 4096 * 9;
1501   case AMDGPUAS::CONSTANT_BUFFER_10:
1502     return 512 + 4096 * 10;
1503   case AMDGPUAS::CONSTANT_BUFFER_11:
1504     return 512 + 4096 * 11;
1505   case AMDGPUAS::CONSTANT_BUFFER_12:
1506     return 512 + 4096 * 12;
1507   case AMDGPUAS::CONSTANT_BUFFER_13:
1508     return 512 + 4096 * 13;
1509   case AMDGPUAS::CONSTANT_BUFFER_14:
1510     return 512 + 4096 * 14;
1511   case AMDGPUAS::CONSTANT_BUFFER_15:
1512     return 512 + 4096 * 15;
1513   default:
1514     return -1;
1515   }
1516 }
1517
1518 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1519 {
1520   EVT VT = Op.getValueType();
1521   SDLoc DL(Op);
1522   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1523   SDValue Chain = Op.getOperand(0);
1524   SDValue Ptr = Op.getOperand(1);
1525   SDValue LoweredLoad;
1526
1527   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1528   if (Ret.getNode()) {
1529     SDValue Ops[2] = {
1530       Ret,
1531       Chain
1532     };
1533     return DAG.getMergeValues(Ops, DL);
1534   }
1535
1536   // Lower loads constant address space global variable loads
1537   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1538       isa<GlobalVariable>(
1539           GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
1540
1541     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1542         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1543     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1544         DAG.getConstant(2, MVT::i32));
1545     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1546                        LoadNode->getChain(), Ptr,
1547                        DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
1548   }
1549
1550   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1551     SDValue MergedValues[2] = {
1552       ScalarizeVectorLoad(Op, DAG),
1553       Chain
1554     };
1555     return DAG.getMergeValues(MergedValues, DL);
1556   }
1557
1558   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1559   if (ConstantBlock > -1 &&
1560       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1561        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1562     SDValue Result;
1563     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1564         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1565         isa<ConstantSDNode>(Ptr)) {
1566       SDValue Slots[4];
1567       for (unsigned i = 0; i < 4; i++) {
1568         // We want Const position encoded with the following formula :
1569         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1570         // const_index is Ptr computed by llvm using an alignment of 16.
1571         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1572         // then div by 4 at the ISel step
1573         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1574             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1575         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1576       }
1577       EVT NewVT = MVT::v4i32;
1578       unsigned NumElements = 4;
1579       if (VT.isVector()) {
1580         NewVT = VT;
1581         NumElements = VT.getVectorNumElements();
1582       }
1583       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1584                            makeArrayRef(Slots, NumElements));
1585     } else {
1586       // non-constant ptr can't be folded, keeps it as a v4f32 load
1587       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1588           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1589           DAG.getConstant(LoadNode->getAddressSpace() -
1590                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1591           );
1592     }
1593
1594     if (!VT.isVector()) {
1595       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1596           DAG.getConstant(0, MVT::i32));
1597     }
1598
1599     SDValue MergedValues[2] = {
1600       Result,
1601       Chain
1602     };
1603     return DAG.getMergeValues(MergedValues, DL);
1604   }
1605
1606   // For most operations returning SDValue() will result in the node being
1607   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1608   // need to manually expand loads that may be legal in some address spaces and
1609   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1610   // compute shaders, since the data is sign extended when it is uploaded to the
1611   // buffer. However SEXT loads from other address spaces are not supported, so
1612   // we need to expand them here.
1613   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1614     EVT MemVT = LoadNode->getMemoryVT();
1615     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1616     SDValue ShiftAmount =
1617           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1618     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1619                                   LoadNode->getPointerInfo(), MemVT,
1620                                   LoadNode->isVolatile(),
1621                                   LoadNode->isNonTemporal(),
1622                                   LoadNode->isInvariant(),
1623                                   LoadNode->getAlignment());
1624     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1625     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1626
1627     SDValue MergedValues[2] = { Sra, Chain };
1628     return DAG.getMergeValues(MergedValues, DL);
1629   }
1630
1631   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1632     return SDValue();
1633   }
1634
1635   // Lowering for indirect addressing
1636   const MachineFunction &MF = DAG.getMachineFunction();
1637   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1638       getTargetMachine().getSubtargetImpl()->getFrameLowering());
1639   unsigned StackWidth = TFL->getStackWidth(MF);
1640
1641   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1642
1643   if (VT.isVector()) {
1644     unsigned NumElemVT = VT.getVectorNumElements();
1645     EVT ElemVT = VT.getVectorElementType();
1646     SDValue Loads[4];
1647
1648     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1649                                       "vector width in load");
1650
1651     for (unsigned i = 0; i < NumElemVT; ++i) {
1652       unsigned Channel, PtrIncr;
1653       getStackAddress(StackWidth, i, Channel, PtrIncr);
1654       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1655                         DAG.getConstant(PtrIncr, MVT::i32));
1656       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1657                              Chain, Ptr,
1658                              DAG.getTargetConstant(Channel, MVT::i32),
1659                              Op.getOperand(2));
1660     }
1661     for (unsigned i = NumElemVT; i < 4; ++i) {
1662       Loads[i] = DAG.getUNDEF(ElemVT);
1663     }
1664     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1665     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1666   } else {
1667     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1668                               Chain, Ptr,
1669                               DAG.getTargetConstant(0, MVT::i32), // Channel
1670                               Op.getOperand(2));
1671   }
1672
1673   SDValue Ops[2] = {
1674     LoweredLoad,
1675     Chain
1676   };
1677
1678   return DAG.getMergeValues(Ops, DL);
1679 }
1680
1681 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1682   SDValue Chain = Op.getOperand(0);
1683   SDValue Cond  = Op.getOperand(1);
1684   SDValue Jump  = Op.getOperand(2);
1685
1686   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1687                      Chain, Jump, Cond);
1688 }
1689
1690 /// XXX Only kernel functions are supported, so we can assume for now that
1691 /// every function is a kernel function, but in the future we should use
1692 /// separate calling conventions for kernel and non-kernel functions.
1693 SDValue R600TargetLowering::LowerFormalArguments(
1694                                       SDValue Chain,
1695                                       CallingConv::ID CallConv,
1696                                       bool isVarArg,
1697                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1698                                       SDLoc DL, SelectionDAG &DAG,
1699                                       SmallVectorImpl<SDValue> &InVals) const {
1700   SmallVector<CCValAssign, 16> ArgLocs;
1701   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1702                  *DAG.getContext());
1703   MachineFunction &MF = DAG.getMachineFunction();
1704   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1705
1706   SmallVector<ISD::InputArg, 8> LocalIns;
1707
1708   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1709
1710   AnalyzeFormalArguments(CCInfo, LocalIns);
1711
1712   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1713     CCValAssign &VA = ArgLocs[i];
1714     const ISD::InputArg &In = Ins[i];
1715     EVT VT = In.VT;
1716     EVT MemVT = VA.getLocVT();
1717     if (!VT.isVector() && MemVT.isVector()) {
1718       // Get load source type if scalarized.
1719       MemVT = MemVT.getVectorElementType();
1720     }
1721
1722     if (MFI->getShaderType() != ShaderType::COMPUTE) {
1723       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1724       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1725       InVals.push_back(Register);
1726       continue;
1727     }
1728
1729     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1730                                           AMDGPUAS::CONSTANT_BUFFER_0);
1731
1732     // i64 isn't a legal type, so the register type used ends up as i32, which
1733     // isn't expected here. It attempts to create this sextload, but it ends up
1734     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1735     // for <1 x i64>.
1736
1737     // The first 36 bytes of the input buffer contains information about
1738     // thread group and global sizes.
1739     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1740     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1741       // FIXME: This should really check the extload type, but the handling of
1742       // extload vector parameters seems to be broken.
1743
1744       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1745       Ext = ISD::SEXTLOAD;
1746     }
1747
1748     // Compute the offset from the value.
1749     // XXX - I think PartOffset should give you this, but it seems to give the
1750     // size of the register which isn't useful.
1751
1752     unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
1753     unsigned PartOffset = VA.getLocMemOffset();
1754     unsigned Offset = 36 + VA.getLocMemOffset();
1755
1756     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1757     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1758                               DAG.getConstant(Offset, MVT::i32),
1759                               DAG.getUNDEF(MVT::i32),
1760                               PtrInfo,
1761                               MemVT, false, true, true, 4);
1762
1763     // 4 is the preferred alignment for the CONSTANT memory space.
1764     InVals.push_back(Arg);
1765     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1766   }
1767   return Chain;
1768 }
1769
1770 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1771    if (!VT.isVector())
1772      return MVT::i32;
1773    return VT.changeVectorElementTypeToInteger();
1774 }
1775
1776 static SDValue CompactSwizzlableVector(
1777   SelectionDAG &DAG, SDValue VectorEntry,
1778   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1779   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1780   assert(RemapSwizzle.empty());
1781   SDValue NewBldVec[4] = {
1782     VectorEntry.getOperand(0),
1783     VectorEntry.getOperand(1),
1784     VectorEntry.getOperand(2),
1785     VectorEntry.getOperand(3)
1786   };
1787
1788   for (unsigned i = 0; i < 4; i++) {
1789     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1790       // We mask write here to teach later passes that the ith element of this
1791       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1792       // break false dependencies and additionnaly make assembly easier to read.
1793       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1794     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1795       if (C->isZero()) {
1796         RemapSwizzle[i] = 4; // SEL_0
1797         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1798       } else if (C->isExactlyValue(1.0)) {
1799         RemapSwizzle[i] = 5; // SEL_1
1800         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1801       }
1802     }
1803
1804     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1805       continue;
1806     for (unsigned j = 0; j < i; j++) {
1807       if (NewBldVec[i] == NewBldVec[j]) {
1808         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1809         RemapSwizzle[i] = j;
1810         break;
1811       }
1812     }
1813   }
1814
1815   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1816                      VectorEntry.getValueType(), NewBldVec);
1817 }
1818
1819 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1820                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1821   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1822   assert(RemapSwizzle.empty());
1823   SDValue NewBldVec[4] = {
1824       VectorEntry.getOperand(0),
1825       VectorEntry.getOperand(1),
1826       VectorEntry.getOperand(2),
1827       VectorEntry.getOperand(3)
1828   };
1829   bool isUnmovable[4] = { false, false, false, false };
1830   for (unsigned i = 0; i < 4; i++) {
1831     RemapSwizzle[i] = i;
1832     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1833       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1834           ->getZExtValue();
1835       if (i == Idx)
1836         isUnmovable[Idx] = true;
1837     }
1838   }
1839
1840   for (unsigned i = 0; i < 4; i++) {
1841     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1842       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1843           ->getZExtValue();
1844       if (isUnmovable[Idx])
1845         continue;
1846       // Swap i and Idx
1847       std::swap(NewBldVec[Idx], NewBldVec[i]);
1848       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1849       break;
1850     }
1851   }
1852
1853   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1854                      VectorEntry.getValueType(), NewBldVec);
1855 }
1856
1857
1858 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1859 SDValue Swz[4], SelectionDAG &DAG) const {
1860   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1861   // Old -> New swizzle values
1862   DenseMap<unsigned, unsigned> SwizzleRemap;
1863
1864   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1865   for (unsigned i = 0; i < 4; i++) {
1866     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1867     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1868       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1869   }
1870
1871   SwizzleRemap.clear();
1872   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1873   for (unsigned i = 0; i < 4; i++) {
1874     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1875     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1876       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1877   }
1878
1879   return BuildVector;
1880 }
1881
1882
1883 //===----------------------------------------------------------------------===//
1884 // Custom DAG Optimizations
1885 //===----------------------------------------------------------------------===//
1886
1887 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1888                                               DAGCombinerInfo &DCI) const {
1889   SelectionDAG &DAG = DCI.DAG;
1890
1891   switch (N->getOpcode()) {
1892   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1893   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1894   case ISD::FP_ROUND: {
1895       SDValue Arg = N->getOperand(0);
1896       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1897         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1898                            Arg.getOperand(0));
1899       }
1900       break;
1901     }
1902
1903   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1904   // (i32 select_cc f32, f32, -1, 0 cc)
1905   //
1906   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1907   // this to one of the SET*_DX10 instructions.
1908   case ISD::FP_TO_SINT: {
1909     SDValue FNeg = N->getOperand(0);
1910     if (FNeg.getOpcode() != ISD::FNEG) {
1911       return SDValue();
1912     }
1913     SDValue SelectCC = FNeg.getOperand(0);
1914     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1915         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1916         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1917         !isHWTrueValue(SelectCC.getOperand(2)) ||
1918         !isHWFalseValue(SelectCC.getOperand(3))) {
1919       return SDValue();
1920     }
1921
1922     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1923                            SelectCC.getOperand(0), // LHS
1924                            SelectCC.getOperand(1), // RHS
1925                            DAG.getConstant(-1, MVT::i32), // True
1926                            DAG.getConstant(0, MVT::i32),  // Flase
1927                            SelectCC.getOperand(4)); // CC
1928
1929     break;
1930   }
1931
1932   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1933   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1934   case ISD::INSERT_VECTOR_ELT: {
1935     SDValue InVec = N->getOperand(0);
1936     SDValue InVal = N->getOperand(1);
1937     SDValue EltNo = N->getOperand(2);
1938     SDLoc dl(N);
1939
1940     // If the inserted element is an UNDEF, just use the input vector.
1941     if (InVal.getOpcode() == ISD::UNDEF)
1942       return InVec;
1943
1944     EVT VT = InVec.getValueType();
1945
1946     // If we can't generate a legal BUILD_VECTOR, exit
1947     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1948       return SDValue();
1949
1950     // Check that we know which element is being inserted
1951     if (!isa<ConstantSDNode>(EltNo))
1952       return SDValue();
1953     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1954
1955     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1956     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1957     // vector elements.
1958     SmallVector<SDValue, 8> Ops;
1959     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1960       Ops.append(InVec.getNode()->op_begin(),
1961                  InVec.getNode()->op_end());
1962     } else if (InVec.getOpcode() == ISD::UNDEF) {
1963       unsigned NElts = VT.getVectorNumElements();
1964       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1965     } else {
1966       return SDValue();
1967     }
1968
1969     // Insert the element
1970     if (Elt < Ops.size()) {
1971       // All the operands of BUILD_VECTOR must have the same type;
1972       // we enforce that here.
1973       EVT OpVT = Ops[0].getValueType();
1974       if (InVal.getValueType() != OpVT)
1975         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1976           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1977           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1978       Ops[Elt] = InVal;
1979     }
1980
1981     // Return the new vector
1982     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1983   }
1984
1985   // Extract_vec (Build_vector) generated by custom lowering
1986   // also needs to be customly combined
1987   case ISD::EXTRACT_VECTOR_ELT: {
1988     SDValue Arg = N->getOperand(0);
1989     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1990       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1991         unsigned Element = Const->getZExtValue();
1992         return Arg->getOperand(Element);
1993       }
1994     }
1995     if (Arg.getOpcode() == ISD::BITCAST &&
1996         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1997       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1998         unsigned Element = Const->getZExtValue();
1999         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
2000             Arg->getOperand(0).getOperand(Element));
2001       }
2002     }
2003   }
2004
2005   case ISD::SELECT_CC: {
2006     // Try common optimizations
2007     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2008     if (Ret.getNode())
2009       return Ret;
2010
2011     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
2012     //      selectcc x, y, a, b, inv(cc)
2013     //
2014     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
2015     //      selectcc x, y, a, b, cc
2016     SDValue LHS = N->getOperand(0);
2017     if (LHS.getOpcode() != ISD::SELECT_CC) {
2018       return SDValue();
2019     }
2020
2021     SDValue RHS = N->getOperand(1);
2022     SDValue True = N->getOperand(2);
2023     SDValue False = N->getOperand(3);
2024     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2025
2026     if (LHS.getOperand(2).getNode() != True.getNode() ||
2027         LHS.getOperand(3).getNode() != False.getNode() ||
2028         RHS.getNode() != False.getNode()) {
2029       return SDValue();
2030     }
2031
2032     switch (NCC) {
2033     default: return SDValue();
2034     case ISD::SETNE: return LHS;
2035     case ISD::SETEQ: {
2036       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2037       LHSCC = ISD::getSetCCInverse(LHSCC,
2038                                   LHS.getOperand(0).getValueType().isInteger());
2039       if (DCI.isBeforeLegalizeOps() ||
2040           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2041         return DAG.getSelectCC(SDLoc(N),
2042                                LHS.getOperand(0),
2043                                LHS.getOperand(1),
2044                                LHS.getOperand(2),
2045                                LHS.getOperand(3),
2046                                LHSCC);
2047       break;
2048     }
2049     }
2050     return SDValue();
2051   }
2052
2053   case AMDGPUISD::EXPORT: {
2054     SDValue Arg = N->getOperand(1);
2055     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2056       break;
2057
2058     SDValue NewArgs[8] = {
2059       N->getOperand(0), // Chain
2060       SDValue(),
2061       N->getOperand(2), // ArrayBase
2062       N->getOperand(3), // Type
2063       N->getOperand(4), // SWZ_X
2064       N->getOperand(5), // SWZ_Y
2065       N->getOperand(6), // SWZ_Z
2066       N->getOperand(7) // SWZ_W
2067     };
2068     SDLoc DL(N);
2069     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2070     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2071   }
2072   case AMDGPUISD::TEXTURE_FETCH: {
2073     SDValue Arg = N->getOperand(1);
2074     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2075       break;
2076
2077     SDValue NewArgs[19] = {
2078       N->getOperand(0),
2079       N->getOperand(1),
2080       N->getOperand(2),
2081       N->getOperand(3),
2082       N->getOperand(4),
2083       N->getOperand(5),
2084       N->getOperand(6),
2085       N->getOperand(7),
2086       N->getOperand(8),
2087       N->getOperand(9),
2088       N->getOperand(10),
2089       N->getOperand(11),
2090       N->getOperand(12),
2091       N->getOperand(13),
2092       N->getOperand(14),
2093       N->getOperand(15),
2094       N->getOperand(16),
2095       N->getOperand(17),
2096       N->getOperand(18),
2097     };
2098     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2099     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2100         NewArgs);
2101   }
2102   }
2103
2104   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2105 }
2106
2107 static bool
2108 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2109             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2110   const R600InstrInfo *TII =
2111       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2112   if (!Src.isMachineOpcode())
2113     return false;
2114   switch (Src.getMachineOpcode()) {
2115   case AMDGPU::FNEG_R600:
2116     if (!Neg.getNode())
2117       return false;
2118     Src = Src.getOperand(0);
2119     Neg = DAG.getTargetConstant(1, MVT::i32);
2120     return true;
2121   case AMDGPU::FABS_R600:
2122     if (!Abs.getNode())
2123       return false;
2124     Src = Src.getOperand(0);
2125     Abs = DAG.getTargetConstant(1, MVT::i32);
2126     return true;
2127   case AMDGPU::CONST_COPY: {
2128     unsigned Opcode = ParentNode->getMachineOpcode();
2129     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2130
2131     if (!Sel.getNode())
2132       return false;
2133
2134     SDValue CstOffset = Src.getOperand(0);
2135     if (ParentNode->getValueType(0).isVector())
2136       return false;
2137
2138     // Gather constants values
2139     int SrcIndices[] = {
2140       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2141       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2142       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2143       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2144       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2145       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2146       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2147       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2148       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2149       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2150       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2151     };
2152     std::vector<unsigned> Consts;
2153     for (int OtherSrcIdx : SrcIndices) {
2154       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2155       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2156         continue;
2157       if (HasDst) {
2158         OtherSrcIdx--;
2159         OtherSelIdx--;
2160       }
2161       if (RegisterSDNode *Reg =
2162           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2163         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2164           ConstantSDNode *Cst
2165             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2166           Consts.push_back(Cst->getZExtValue());
2167         }
2168       }
2169     }
2170
2171     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2172     Consts.push_back(Cst->getZExtValue());
2173     if (!TII->fitsConstReadLimitations(Consts)) {
2174       return false;
2175     }
2176
2177     Sel = CstOffset;
2178     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2179     return true;
2180   }
2181   case AMDGPU::MOV_IMM_I32:
2182   case AMDGPU::MOV_IMM_F32: {
2183     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2184     uint64_t ImmValue = 0;
2185
2186
2187     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2188       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2189       float FloatValue = FPC->getValueAPF().convertToFloat();
2190       if (FloatValue == 0.0) {
2191         ImmReg = AMDGPU::ZERO;
2192       } else if (FloatValue == 0.5) {
2193         ImmReg = AMDGPU::HALF;
2194       } else if (FloatValue == 1.0) {
2195         ImmReg = AMDGPU::ONE;
2196       } else {
2197         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2198       }
2199     } else {
2200       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2201       uint64_t Value = C->getZExtValue();
2202       if (Value == 0) {
2203         ImmReg = AMDGPU::ZERO;
2204       } else if (Value == 1) {
2205         ImmReg = AMDGPU::ONE_INT;
2206       } else {
2207         ImmValue = Value;
2208       }
2209     }
2210
2211     // Check that we aren't already using an immediate.
2212     // XXX: It's possible for an instruction to have more than one
2213     // immediate operand, but this is not supported yet.
2214     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2215       if (!Imm.getNode())
2216         return false;
2217       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2218       assert(C);
2219       if (C->getZExtValue())
2220         return false;
2221       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2222     }
2223     Src = DAG.getRegister(ImmReg, MVT::i32);
2224     return true;
2225   }
2226   default:
2227     return false;
2228   }
2229 }
2230
2231
2232 /// \brief Fold the instructions after selecting them
2233 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2234                                             SelectionDAG &DAG) const {
2235   const R600InstrInfo *TII =
2236       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2237   if (!Node->isMachineOpcode())
2238     return Node;
2239   unsigned Opcode = Node->getMachineOpcode();
2240   SDValue FakeOp;
2241
2242   std::vector<SDValue> Ops;
2243   for (const SDUse &I : Node->ops())
2244     Ops.push_back(I);
2245
2246   if (Opcode == AMDGPU::DOT_4) {
2247     int OperandIdx[] = {
2248       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2249       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2250       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2251       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2252       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2253       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2254       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2255       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2256         };
2257     int NegIdx[] = {
2258       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2259       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2260       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2261       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2262       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2263       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2264       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2265       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2266     };
2267     int AbsIdx[] = {
2268       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2269       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2270       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2271       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2272       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2273       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2274       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2275       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2276     };
2277     for (unsigned i = 0; i < 8; i++) {
2278       if (OperandIdx[i] < 0)
2279         return Node;
2280       SDValue &Src = Ops[OperandIdx[i] - 1];
2281       SDValue &Neg = Ops[NegIdx[i] - 1];
2282       SDValue &Abs = Ops[AbsIdx[i] - 1];
2283       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2284       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2285       if (HasDst)
2286         SelIdx--;
2287       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2288       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2289         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2290     }
2291   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2292     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2293       SDValue &Src = Ops[i];
2294       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2295         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2296     }
2297   } else if (Opcode == AMDGPU::CLAMP_R600) {
2298     SDValue Src = Node->getOperand(0);
2299     if (!Src.isMachineOpcode() ||
2300         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2301       return Node;
2302     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2303         AMDGPU::OpName::clamp);
2304     if (ClampIdx < 0)
2305       return Node;
2306     std::vector<SDValue> Ops;
2307     unsigned NumOp = Src.getNumOperands();
2308     for(unsigned i = 0; i < NumOp; ++i)
2309           Ops.push_back(Src.getOperand(i));
2310     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2311     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2312         Node->getVTList(), Ops);
2313   } else {
2314     if (!TII->hasInstrModifiers(Opcode))
2315       return Node;
2316     int OperandIdx[] = {
2317       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2318       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2319       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2320     };
2321     int NegIdx[] = {
2322       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2323       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2324       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2325     };
2326     int AbsIdx[] = {
2327       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2328       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2329       -1
2330     };
2331     for (unsigned i = 0; i < 3; i++) {
2332       if (OperandIdx[i] < 0)
2333         return Node;
2334       SDValue &Src = Ops[OperandIdx[i] - 1];
2335       SDValue &Neg = Ops[NegIdx[i] - 1];
2336       SDValue FakeAbs;
2337       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2338       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2339       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2340       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2341       if (HasDst) {
2342         SelIdx--;
2343         ImmIdx--;
2344       }
2345       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2346       SDValue &Imm = Ops[ImmIdx];
2347       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2348         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2349     }
2350   }
2351
2352   return Node;
2353 }