lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  34     AMDGPUTargetLowering(TM),
  35     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42
  43   computeRegisterProperties();
  44
  45   // Set condition code actions
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // Expand sign extension of vectors
  95   if (!Subtarget->hasBFE())
  96     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  97
  98   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 100
 101   if (!Subtarget->hasBFE())
 102     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 103   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 105
 106   if (!Subtarget->hasBFE())
 107     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 110
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 113   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 114
 115   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 116
 117
 118   // Legalize loads and stores to the private address space.
 119   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 120   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 121   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 122
 123   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 124   // spaces, so it is custom lowered to handle those where it isn't.
 125   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
 126   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
 127   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
 128   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
 129   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
 130   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
 131
 132   setOperationAction(ISD::STORE, MVT::i8, Custom);
 133   setOperationAction(ISD::STORE, MVT::i32, Custom);
 134   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 135   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 136   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 137   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 138
 139   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 140   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 141   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 142
 143   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 144   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 145   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 146   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 147
 148   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 149   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 150   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 151   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 152
 153   setTargetDAGCombine(ISD::FP_ROUND);
 154   setTargetDAGCombine(ISD::FP_TO_SINT);
 155   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 156   setTargetDAGCombine(ISD::SELECT_CC);
 157   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 158
 159   setOperationAction(ISD::SUB, MVT::i64, Expand);
 160
 161   // These should be replaced by UDVIREM, but it does not happen automatically
 162   // during Type Legalization
 163   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 164   setOperationAction(ISD::UREM, MVT::i64, Custom);
 165   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 166   setOperationAction(ISD::SREM, MVT::i64, Custom);
 167
 168   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 169   //  to be Legal/Custom in order to avoid library calls.
 170   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 171   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 172   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 173
 174   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 175
 176   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 177   for (MVT VT : ScalarIntVTs) {
 178     setOperationAction(ISD::ADDC, VT, Expand);
 179     setOperationAction(ISD::SUBC, VT, Expand);
 180     setOperationAction(ISD::ADDE, VT, Expand);
 181     setOperationAction(ISD::SUBE, VT, Expand);
 182   }
 183
 184   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 185   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 186   setSchedulingPreference(Sched::Source);
 187 }
 188
 189 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 190     MachineInstr * MI, MachineBasicBlock * BB) const {
 191   MachineFunction * MF = BB->getParent();
 192   MachineRegisterInfo &MRI = MF->getRegInfo();
 193   MachineBasicBlock::iterator I = *MI;
 194   const R600InstrInfo *TII =
 195       static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());
 196
 197   switch (MI->getOpcode()) {
 198   default:
 199     // Replace LDS_*_RET instruction that don't have any uses with the
 200     // equivalent LDS_*_NORET instruction.
 201     if (TII->isLDSRetInstr(MI->getOpcode())) {
 202       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 203       assert(DstIdx != -1);
 204       MachineInstrBuilder NewMI;
 205       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
 206         return BB;
 207
 208       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 209                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 210       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 211         NewMI.addOperand(MI->getOperand(i));
 212       }
 213     } else {
 214       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 215     }
 216     break;
 217   case AMDGPU::CLAMP_R600: {
 218     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 219                                                    AMDGPU::MOV,
 220                                                    MI->getOperand(0).getReg(),
 221                                                    MI->getOperand(1).getReg());
 222     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 223     break;
 224   }
 225
 226   case AMDGPU::FABS_R600: {
 227     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 228                                                     AMDGPU::MOV,
 229                                                     MI->getOperand(0).getReg(),
 230                                                     MI->getOperand(1).getReg());
 231     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 232     break;
 233   }
 234
 235   case AMDGPU::FNEG_R600: {
 236     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 237                                                     AMDGPU::MOV,
 238                                                     MI->getOperand(0).getReg(),
 239                                                     MI->getOperand(1).getReg());
 240     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 241     break;
 242   }
 243
 244   case AMDGPU::MASK_WRITE: {
 245     unsigned maskedRegister = MI->getOperand(0).getReg();
 246     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 247     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 248     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 249     break;
 250   }
 251
 252   case AMDGPU::MOV_IMM_F32:
 253     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 254                      MI->getOperand(1).getFPImm()->getValueAPF()
 255                          .bitcastToAPInt().getZExtValue());
 256     break;
 257   case AMDGPU::MOV_IMM_I32:
 258     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 259                      MI->getOperand(1).getImm());
 260     break;
 261   case AMDGPU::CONST_COPY: {
 262     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 263         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 264     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 265         MI->getOperand(1).getImm());
 266     break;
 267   }
 268
 269   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 270   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 271   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 272     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 273
 274     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 275             .addOperand(MI->getOperand(0))
 276             .addOperand(MI->getOperand(1))
 277             .addImm(EOP); // Set End of program bit
 278     break;
 279   }
 280
 281   case AMDGPU::TXD: {
 282     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 283     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 284     MachineOperand &RID = MI->getOperand(4);
 285     MachineOperand &SID = MI->getOperand(5);
 286     unsigned TextureId = MI->getOperand(6).getImm();
 287     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 288     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 289
 290     switch (TextureId) {
 291     case 5: // Rect
 292       CTX = CTY = 0;
 293       break;
 294     case 6: // Shadow1D
 295       SrcW = SrcZ;
 296       break;
 297     case 7: // Shadow2D
 298       SrcW = SrcZ;
 299       break;
 300     case 8: // ShadowRect
 301       CTX = CTY = 0;
 302       SrcW = SrcZ;
 303       break;
 304     case 9: // 1DArray
 305       SrcZ = SrcY;
 306       CTZ = 0;
 307       break;
 308     case 10: // 2DArray
 309       CTZ = 0;
 310       break;
 311     case 11: // Shadow1DArray
 312       SrcZ = SrcY;
 313       CTZ = 0;
 314       break;
 315     case 12: // Shadow2DArray
 316       CTZ = 0;
 317       break;
 318     }
 319     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 320             .addOperand(MI->getOperand(3))
 321             .addImm(SrcX)
 322             .addImm(SrcY)
 323             .addImm(SrcZ)
 324             .addImm(SrcW)
 325             .addImm(0)
 326             .addImm(0)
 327             .addImm(0)
 328             .addImm(0)
 329             .addImm(1)
 330             .addImm(2)
 331             .addImm(3)
 332             .addOperand(RID)
 333             .addOperand(SID)
 334             .addImm(CTX)
 335             .addImm(CTY)
 336             .addImm(CTZ)
 337             .addImm(CTW);
 338     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 339             .addOperand(MI->getOperand(2))
 340             .addImm(SrcX)
 341             .addImm(SrcY)
 342             .addImm(SrcZ)
 343             .addImm(SrcW)
 344             .addImm(0)
 345             .addImm(0)
 346             .addImm(0)
 347             .addImm(0)
 348             .addImm(1)
 349             .addImm(2)
 350             .addImm(3)
 351             .addOperand(RID)
 352             .addOperand(SID)
 353             .addImm(CTX)
 354             .addImm(CTY)
 355             .addImm(CTZ)
 356             .addImm(CTW);
 357     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 358             .addOperand(MI->getOperand(0))
 359             .addOperand(MI->getOperand(1))
 360             .addImm(SrcX)
 361             .addImm(SrcY)
 362             .addImm(SrcZ)
 363             .addImm(SrcW)
 364             .addImm(0)
 365             .addImm(0)
 366             .addImm(0)
 367             .addImm(0)
 368             .addImm(1)
 369             .addImm(2)
 370             .addImm(3)
 371             .addOperand(RID)
 372             .addOperand(SID)
 373             .addImm(CTX)
 374             .addImm(CTY)
 375             .addImm(CTZ)
 376             .addImm(CTW)
 377             .addReg(T0, RegState::Implicit)
 378             .addReg(T1, RegState::Implicit);
 379     break;
 380   }
 381
 382   case AMDGPU::TXD_SHADOW: {
 383     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 384     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 385     MachineOperand &RID = MI->getOperand(4);
 386     MachineOperand &SID = MI->getOperand(5);
 387     unsigned TextureId = MI->getOperand(6).getImm();
 388     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 389     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 390
 391     switch (TextureId) {
 392     case 5: // Rect
 393       CTX = CTY = 0;
 394       break;
 395     case 6: // Shadow1D
 396       SrcW = SrcZ;
 397       break;
 398     case 7: // Shadow2D
 399       SrcW = SrcZ;
 400       break;
 401     case 8: // ShadowRect
 402       CTX = CTY = 0;
 403       SrcW = SrcZ;
 404       break;
 405     case 9: // 1DArray
 406       SrcZ = SrcY;
 407       CTZ = 0;
 408       break;
 409     case 10: // 2DArray
 410       CTZ = 0;
 411       break;
 412     case 11: // Shadow1DArray
 413       SrcZ = SrcY;
 414       CTZ = 0;
 415       break;
 416     case 12: // Shadow2DArray
 417       CTZ = 0;
 418       break;
 419     }
 420
 421     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 422             .addOperand(MI->getOperand(3))
 423             .addImm(SrcX)
 424             .addImm(SrcY)
 425             .addImm(SrcZ)
 426             .addImm(SrcW)
 427             .addImm(0)
 428             .addImm(0)
 429             .addImm(0)
 430             .addImm(0)
 431             .addImm(1)
 432             .addImm(2)
 433             .addImm(3)
 434             .addOperand(RID)
 435             .addOperand(SID)
 436             .addImm(CTX)
 437             .addImm(CTY)
 438             .addImm(CTZ)
 439             .addImm(CTW);
 440     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 441             .addOperand(MI->getOperand(2))
 442             .addImm(SrcX)
 443             .addImm(SrcY)
 444             .addImm(SrcZ)
 445             .addImm(SrcW)
 446             .addImm(0)
 447             .addImm(0)
 448             .addImm(0)
 449             .addImm(0)
 450             .addImm(1)
 451             .addImm(2)
 452             .addImm(3)
 453             .addOperand(RID)
 454             .addOperand(SID)
 455             .addImm(CTX)
 456             .addImm(CTY)
 457             .addImm(CTZ)
 458             .addImm(CTW);
 459     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 460             .addOperand(MI->getOperand(0))
 461             .addOperand(MI->getOperand(1))
 462             .addImm(SrcX)
 463             .addImm(SrcY)
 464             .addImm(SrcZ)
 465             .addImm(SrcW)
 466             .addImm(0)
 467             .addImm(0)
 468             .addImm(0)
 469             .addImm(0)
 470             .addImm(1)
 471             .addImm(2)
 472             .addImm(3)
 473             .addOperand(RID)
 474             .addOperand(SID)
 475             .addImm(CTX)
 476             .addImm(CTY)
 477             .addImm(CTZ)
 478             .addImm(CTW)
 479             .addReg(T0, RegState::Implicit)
 480             .addReg(T1, RegState::Implicit);
 481     break;
 482   }
 483
 484   case AMDGPU::BRANCH:
 485       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 486               .addOperand(MI->getOperand(0));
 487       break;
 488
 489   case AMDGPU::BRANCH_COND_f32: {
 490     MachineInstr *NewMI =
 491       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 492               AMDGPU::PREDICATE_BIT)
 493               .addOperand(MI->getOperand(1))
 494               .addImm(OPCODE_IS_NOT_ZERO)
 495               .addImm(0); // Flags
 496     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 497     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 498             .addOperand(MI->getOperand(0))
 499             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 500     break;
 501   }
 502
 503   case AMDGPU::BRANCH_COND_i32: {
 504     MachineInstr *NewMI =
 505       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 506             AMDGPU::PREDICATE_BIT)
 507             .addOperand(MI->getOperand(1))
 508             .addImm(OPCODE_IS_NOT_ZERO_INT)
 509             .addImm(0); // Flags
 510     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 511     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 512            .addOperand(MI->getOperand(0))
 513             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 514     break;
 515   }
 516
 517   case AMDGPU::EG_ExportSwz:
 518   case AMDGPU::R600_ExportSwz: {
 519     // Instruction is left unmodified if its not the last one of its type
 520     bool isLastInstructionOfItsType = true;
 521     unsigned InstExportType = MI->getOperand(1).getImm();
 522     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 523          EndBlock = BB->end(); NextExportInst != EndBlock;
 524          NextExportInst = std::next(NextExportInst)) {
 525       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 526           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 527         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 528             .getImm();
 529         if (CurrentInstExportType == InstExportType) {
 530           isLastInstructionOfItsType = false;
 531           break;
 532         }
 533       }
 534     }
 535     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 536     if (!EOP && !isLastInstructionOfItsType)
 537       return BB;
 538     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 539     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 540             .addOperand(MI->getOperand(0))
 541             .addOperand(MI->getOperand(1))
 542             .addOperand(MI->getOperand(2))
 543             .addOperand(MI->getOperand(3))
 544             .addOperand(MI->getOperand(4))
 545             .addOperand(MI->getOperand(5))
 546             .addOperand(MI->getOperand(6))
 547             .addImm(CfInst)
 548             .addImm(EOP);
 549     break;
 550   }
 551   case AMDGPU::RETURN: {
 552     // RETURN instructions must have the live-out registers as implicit uses,
 553     // otherwise they appear dead.
 554     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 555     MachineInstrBuilder MIB(*MF, MI);
 556     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 557       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 558     return BB;
 559   }
 560   }
 561
 562   MI->eraseFromParent();
 563   return BB;
 564 }
 565
 566 //===----------------------------------------------------------------------===//
 567 // Custom DAG Lowering Operations
 568 //===----------------------------------------------------------------------===//
 569
 570 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 571   MachineFunction &MF = DAG.getMachineFunction();
 572   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 573   switch (Op.getOpcode()) {
 574   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 575   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 576   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 577   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 578   case ISD::SRA_PARTS:
 579   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 580   case ISD::FCOS:
 581   case ISD::FSIN: return LowerTrig(Op, DAG);
 582   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 583   case ISD::STORE: return LowerSTORE(Op, DAG);
 584   case ISD::LOAD: {
 585     SDValue Result = LowerLOAD(Op, DAG);
 586     assert((!Result.getNode() ||
 587             Result.getNode()->getNumValues() == 2) &&
 588            "Load should return a value and a chain");
 589     return Result;
 590   }
 591
 592   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 593   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 594   case ISD::INTRINSIC_VOID: {
 595     SDValue Chain = Op.getOperand(0);
 596     unsigned IntrinsicID =
 597                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 598     switch (IntrinsicID) {
 599     case AMDGPUIntrinsic::AMDGPU_store_output: {
 600       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 601       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 602       MFI->LiveOuts.push_back(Reg);
 603       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 604     }
 605     case AMDGPUIntrinsic::R600_store_swizzle: {
 606       const SDValue Args[8] = {
 607         Chain,
 608         Op.getOperand(2), // Export Value
 609         Op.getOperand(3), // ArrayBase
 610         Op.getOperand(4), // Type
 611         DAG.getConstant(0, MVT::i32), // SWZ_X
 612         DAG.getConstant(1, MVT::i32), // SWZ_Y
 613         DAG.getConstant(2, MVT::i32), // SWZ_Z
 614         DAG.getConstant(3, MVT::i32) // SWZ_W
 615       };
 616       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 617     }
 618
 619     // default for switch(IntrinsicID)
 620     default: break;
 621     }
 622     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 623     break;
 624   }
 625   case ISD::INTRINSIC_WO_CHAIN: {
 626     unsigned IntrinsicID =
 627                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 628     EVT VT = Op.getValueType();
 629     SDLoc DL(Op);
 630     switch(IntrinsicID) {
 631     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 632     case AMDGPUIntrinsic::R600_load_input: {
 633       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 634       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 635       MachineFunction &MF = DAG.getMachineFunction();
 636       MachineRegisterInfo &MRI = MF.getRegInfo();
 637       MRI.addLiveIn(Reg);
 638       return DAG.getCopyFromReg(DAG.getEntryNode(),
 639           SDLoc(DAG.getEntryNode()), Reg, VT);
 640     }
 641
 642     case AMDGPUIntrinsic::R600_interp_input: {
 643       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 644       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 645       MachineSDNode *interp;
 646       if (ijb < 0) {
 647         const MachineFunction &MF = DAG.getMachineFunction();
 648         const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
 649             MF.getSubtarget().getInstrInfo());
 650         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 651             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 652         return DAG.getTargetExtractSubreg(
 653             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 654             DL, MVT::f32, SDValue(interp, 0));
 655       }
 656       MachineFunction &MF = DAG.getMachineFunction();
 657       MachineRegisterInfo &MRI = MF.getRegInfo();
 658       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 659       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 660       MRI.addLiveIn(RegisterI);
 661       MRI.addLiveIn(RegisterJ);
 662       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 663           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 664       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 665           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 666
 667       if (slot % 4 < 2)
 668         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 669             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 670             RegisterJNode, RegisterINode);
 671       else
 672         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 673             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 674             RegisterJNode, RegisterINode);
 675       return SDValue(interp, slot % 2);
 676     }
 677     case AMDGPUIntrinsic::R600_interp_xy:
 678     case AMDGPUIntrinsic::R600_interp_zw: {
 679       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 680       MachineSDNode *interp;
 681       SDValue RegisterINode = Op.getOperand(2);
 682       SDValue RegisterJNode = Op.getOperand(3);
 683
 684       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 685         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 686             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 687             RegisterJNode, RegisterINode);
 688       else
 689         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 690             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 691             RegisterJNode, RegisterINode);
 692       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 693           SDValue(interp, 0), SDValue(interp, 1));
 694     }
 695     case AMDGPUIntrinsic::R600_tex:
 696     case AMDGPUIntrinsic::R600_texc:
 697     case AMDGPUIntrinsic::R600_txl:
 698     case AMDGPUIntrinsic::R600_txlc:
 699     case AMDGPUIntrinsic::R600_txb:
 700     case AMDGPUIntrinsic::R600_txbc:
 701     case AMDGPUIntrinsic::R600_txf:
 702     case AMDGPUIntrinsic::R600_txq:
 703     case AMDGPUIntrinsic::R600_ddx:
 704     case AMDGPUIntrinsic::R600_ddy:
 705     case AMDGPUIntrinsic::R600_ldptr: {
 706       unsigned TextureOp;
 707       switch (IntrinsicID) {
 708       case AMDGPUIntrinsic::R600_tex:
 709         TextureOp = 0;
 710         break;
 711       case AMDGPUIntrinsic::R600_texc:
 712         TextureOp = 1;
 713         break;
 714       case AMDGPUIntrinsic::R600_txl:
 715         TextureOp = 2;
 716         break;
 717       case AMDGPUIntrinsic::R600_txlc:
 718         TextureOp = 3;
 719         break;
 720       case AMDGPUIntrinsic::R600_txb:
 721         TextureOp = 4;
 722         break;
 723       case AMDGPUIntrinsic::R600_txbc:
 724         TextureOp = 5;
 725         break;
 726       case AMDGPUIntrinsic::R600_txf:
 727         TextureOp = 6;
 728         break;
 729       case AMDGPUIntrinsic::R600_txq:
 730         TextureOp = 7;
 731         break;
 732       case AMDGPUIntrinsic::R600_ddx:
 733         TextureOp = 8;
 734         break;
 735       case AMDGPUIntrinsic::R600_ddy:
 736         TextureOp = 9;
 737         break;
 738       case AMDGPUIntrinsic::R600_ldptr:
 739         TextureOp = 10;
 740         break;
 741       default:
 742         llvm_unreachable("Unknow Texture Operation");
 743       }
 744
 745       SDValue TexArgs[19] = {
 746         DAG.getConstant(TextureOp, MVT::i32),
 747         Op.getOperand(1),
 748         DAG.getConstant(0, MVT::i32),
 749         DAG.getConstant(1, MVT::i32),
 750         DAG.getConstant(2, MVT::i32),
 751         DAG.getConstant(3, MVT::i32),
 752         Op.getOperand(2),
 753         Op.getOperand(3),
 754         Op.getOperand(4),
 755         DAG.getConstant(0, MVT::i32),
 756         DAG.getConstant(1, MVT::i32),
 757         DAG.getConstant(2, MVT::i32),
 758         DAG.getConstant(3, MVT::i32),
 759         Op.getOperand(5),
 760         Op.getOperand(6),
 761         Op.getOperand(7),
 762         Op.getOperand(8),
 763         Op.getOperand(9),
 764         Op.getOperand(10)
 765       };
 766       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 767     }
 768     case AMDGPUIntrinsic::AMDGPU_dp4: {
 769       SDValue Args[8] = {
 770       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 771           DAG.getConstant(0, MVT::i32)),
 772       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 773           DAG.getConstant(0, MVT::i32)),
 774       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 775           DAG.getConstant(1, MVT::i32)),
 776       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 777           DAG.getConstant(1, MVT::i32)),
 778       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 779           DAG.getConstant(2, MVT::i32)),
 780       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 781           DAG.getConstant(2, MVT::i32)),
 782       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 783           DAG.getConstant(3, MVT::i32)),
 784       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 785           DAG.getConstant(3, MVT::i32))
 786       };
 787       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 788     }
 789
 790     case Intrinsic::r600_read_ngroups_x:
 791       return LowerImplicitParameter(DAG, VT, DL, 0);
 792     case Intrinsic::r600_read_ngroups_y:
 793       return LowerImplicitParameter(DAG, VT, DL, 1);
 794     case Intrinsic::r600_read_ngroups_z:
 795       return LowerImplicitParameter(DAG, VT, DL, 2);
 796     case Intrinsic::r600_read_global_size_x:
 797       return LowerImplicitParameter(DAG, VT, DL, 3);
 798     case Intrinsic::r600_read_global_size_y:
 799       return LowerImplicitParameter(DAG, VT, DL, 4);
 800     case Intrinsic::r600_read_global_size_z:
 801       return LowerImplicitParameter(DAG, VT, DL, 5);
 802     case Intrinsic::r600_read_local_size_x:
 803       return LowerImplicitParameter(DAG, VT, DL, 6);
 804     case Intrinsic::r600_read_local_size_y:
 805       return LowerImplicitParameter(DAG, VT, DL, 7);
 806     case Intrinsic::r600_read_local_size_z:
 807       return LowerImplicitParameter(DAG, VT, DL, 8);
 808
 809     case Intrinsic::r600_read_tgid_x:
 810       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 811                                   AMDGPU::T1_X, VT);
 812     case Intrinsic::r600_read_tgid_y:
 813       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 814                                   AMDGPU::T1_Y, VT);
 815     case Intrinsic::r600_read_tgid_z:
 816       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 817                                   AMDGPU::T1_Z, VT);
 818     case Intrinsic::r600_read_tidig_x:
 819       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 820                                   AMDGPU::T0_X, VT);
 821     case Intrinsic::r600_read_tidig_y:
 822       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 823                                   AMDGPU::T0_Y, VT);
 824     case Intrinsic::r600_read_tidig_z:
 825       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 826                                   AMDGPU::T0_Z, VT);
 827     case Intrinsic::AMDGPU_rsq:
 828       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 829       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 830     }
 831     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 832     break;
 833   }
 834   } // end switch(Op.getOpcode())
 835   return SDValue();
 836 }
 837
 838 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 839                                             SmallVectorImpl<SDValue> &Results,
 840                                             SelectionDAG &DAG) const {
 841   switch (N->getOpcode()) {
 842   default:
 843     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 844     return;
 845   case ISD::FP_TO_UINT:
 846     if (N->getValueType(0) == MVT::i1) {
 847       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 848       return;
 849     }
 850     // Fall-through. Since we don't care about out of bounds values
 851     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 852     // considers some extra cases which are not necessary here.
 853   case ISD::FP_TO_SINT: {
 854     SDValue Result;
 855     if (expandFP_TO_SINT(N, Result, DAG))
 856       Results.push_back(Result);
 857     return;
 858   }
 859   case ISD::UDIV: {
 860     SDValue Op = SDValue(N, 0);
 861     SDLoc DL(Op);
 862     EVT VT = Op.getValueType();
 863     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 864       N->getOperand(0), N->getOperand(1));
 865     Results.push_back(UDIVREM);
 866     break;
 867   }
 868   case ISD::UREM: {
 869     SDValue Op = SDValue(N, 0);
 870     SDLoc DL(Op);
 871     EVT VT = Op.getValueType();
 872     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 873       N->getOperand(0), N->getOperand(1));
 874     Results.push_back(UDIVREM.getValue(1));
 875     break;
 876   }
 877   case ISD::SDIV: {
 878     SDValue Op = SDValue(N, 0);
 879     SDLoc DL(Op);
 880     EVT VT = Op.getValueType();
 881     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 882       N->getOperand(0), N->getOperand(1));
 883     Results.push_back(SDIVREM);
 884     break;
 885   }
 886   case ISD::SREM: {
 887     SDValue Op = SDValue(N, 0);
 888     SDLoc DL(Op);
 889     EVT VT = Op.getValueType();
 890     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 891       N->getOperand(0), N->getOperand(1));
 892     Results.push_back(SDIVREM.getValue(1));
 893     break;
 894   }
 895   case ISD::SDIVREM: {
 896     SDValue Op = SDValue(N, 1);
 897     SDValue RES = LowerSDIVREM(Op, DAG);
 898     Results.push_back(RES);
 899     Results.push_back(RES.getValue(1));
 900     break;
 901   }
 902   case ISD::UDIVREM: {
 903     SDValue Op = SDValue(N, 0);
 904     SDLoc DL(Op);
 905     EVT VT = Op.getValueType();
 906     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
 907
 908     SDValue one = DAG.getConstant(1, HalfVT);
 909     SDValue zero = DAG.getConstant(0, HalfVT);
 910
 911     //HiLo split
 912     SDValue LHS = N->getOperand(0);
 913     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
 914     SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
 915
 916     SDValue RHS = N->getOperand(1);
 917     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
 918     SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
 919
 920     // Get Speculative values
 921     SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
 922     SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
 923
 924     SDValue REM_Hi = zero;
 925     SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
 926
 927     SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
 928     SDValue DIV_Lo = zero;
 929
 930     const unsigned halfBitWidth = HalfVT.getSizeInBits();
 931
 932     for (unsigned i = 0; i < halfBitWidth; ++i) {
 933       SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
 934       // Get Value of high bit
 935       SDValue HBit;
 936       if (halfBitWidth == 32 && Subtarget->hasBFE()) {
 937         HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
 938       } else {
 939         HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
 940         HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
 941       }
 942
 943       SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
 944         DAG.getConstant(halfBitWidth - 1, HalfVT));
 945       REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
 946       REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
 947
 948       REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
 949       REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
 950
 951
 952       SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 953
 954       SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
 955       SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
 956
 957       DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
 958
 959       // Update REM
 960
 961       SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
 962
 963       REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
 964       REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
 965       REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
 966     }
 967
 968     SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 969     SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
 970     Results.push_back(DIV);
 971     Results.push_back(REM);
 972     break;
 973   }
 974   }
 975 }
 976
 977 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 978                                                    SDValue Vector) const {
 979
 980   SDLoc DL(Vector);
 981   EVT VecVT = Vector.getValueType();
 982   EVT EltVT = VecVT.getVectorElementType();
 983   SmallVector<SDValue, 8> Args;
 984
 985   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 986                                                            i != e; ++i) {
 987     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 988                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 989   }
 990
 991   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 992 }
 993
 994 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 995                                                     SelectionDAG &DAG) const {
 996
 997   SDLoc DL(Op);
 998   SDValue Vector = Op.getOperand(0);
 999   SDValue Index = Op.getOperand(1);
1000
1001   if (isa<ConstantSDNode>(Index) ||
1002       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
1003     return Op;
1004
1005   Vector = vectorToVerticalVector(DAG, Vector);
1006   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
1007                      Vector, Index);
1008 }
1009
1010 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
1011                                                    SelectionDAG &DAG) const {
1012   SDLoc DL(Op);
1013   SDValue Vector = Op.getOperand(0);
1014   SDValue Value = Op.getOperand(1);
1015   SDValue Index = Op.getOperand(2);
1016
1017   if (isa<ConstantSDNode>(Index) ||
1018       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
1019     return Op;
1020
1021   Vector = vectorToVerticalVector(DAG, Vector);
1022   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
1023                                Vector, Value, Index);
1024   return vectorToVerticalVector(DAG, Insert);
1025 }
1026
1027 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
1028   // On hw >= R700, COS/SIN input must be between -1. and 1.
1029   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
1030   EVT VT = Op.getValueType();
1031   SDValue Arg = Op.getOperand(0);
1032   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
1033       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
1034         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
1035           DAG.getConstantFP(0.15915494309, MVT::f32)),
1036         DAG.getConstantFP(0.5, MVT::f32)));
1037   unsigned TrigNode;
1038   switch (Op.getOpcode()) {
1039   case ISD::FCOS:
1040     TrigNode = AMDGPUISD::COS_HW;
1041     break;
1042   case ISD::FSIN:
1043     TrigNode = AMDGPUISD::SIN_HW;
1044     break;
1045   default:
1046     llvm_unreachable("Wrong trig opcode");
1047   }
1048   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
1049       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
1050         DAG.getConstantFP(-0.5, MVT::f32)));
1051   if (Gen >= AMDGPUSubtarget::R700)
1052     return TrigVal;
1053   // On R600 hw, COS/SIN input must be between -Pi and Pi.
1054   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
1055       DAG.getConstantFP(3.14159265359, MVT::f32));
1056 }
1057
1058 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1059   SDLoc DL(Op);
1060   EVT VT = Op.getValueType();
1061
1062   SDValue Lo = Op.getOperand(0);
1063   SDValue Hi = Op.getOperand(1);
1064   SDValue Shift = Op.getOperand(2);
1065   SDValue Zero = DAG.getConstant(0, VT);
1066   SDValue One  = DAG.getConstant(1, VT);
1067
1068   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1069   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1070   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1071   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1072
1073   // The dance around Width1 is necessary for 0 special case.
1074   // Without it the CompShift might be 32, producing incorrect results in
1075   // Overflow. So we do the shift in two steps, the alternative is to
1076   // add a conditional to filter the special case.
1077
1078   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1079   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1080
1081   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1082   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1083   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1084
1085   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1086   SDValue LoBig = Zero;
1087
1088   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1089   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1090
1091   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1092 }
1093
1094 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1095   SDLoc DL(Op);
1096   EVT VT = Op.getValueType();
1097
1098   SDValue Lo = Op.getOperand(0);
1099   SDValue Hi = Op.getOperand(1);
1100   SDValue Shift = Op.getOperand(2);
1101   SDValue Zero = DAG.getConstant(0, VT);
1102   SDValue One  = DAG.getConstant(1, VT);
1103
1104   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1105
1106   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1107   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1108   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1109   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1110
1111   // The dance around Width1 is necessary for 0 special case.
1112   // Without it the CompShift might be 32, producing incorrect results in
1113   // Overflow. So we do the shift in two steps, the alternative is to
1114   // add a conditional to filter the special case.
1115
1116   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1117   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1118
1119   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1120   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1121   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1122
1123   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1124   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1125
1126   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1127   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1128
1129   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1130 }
1131
1132 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1133   return DAG.getNode(
1134       ISD::SETCC,
1135       SDLoc(Op),
1136       MVT::i1,
1137       Op, DAG.getConstantFP(0.0f, MVT::f32),
1138       DAG.getCondCode(ISD::SETNE)
1139       );
1140 }
1141
1142 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1143                                                    SDLoc DL,
1144                                                    unsigned DwordOffset) const {
1145   unsigned ByteOffset = DwordOffset * 4;
1146   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1147                                       AMDGPUAS::CONSTANT_BUFFER_0);
1148
1149   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1150   assert(isInt<16>(ByteOffset));
1151
1152   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1153                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1154                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1155                      false, false, false, 0);
1156 }
1157
1158 bool R600TargetLowering::isZero(SDValue Op) const {
1159   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1160     return Cst->isNullValue();
1161   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1162     return CstFP->isZero();
1163   } else {
1164     return false;
1165   }
1166 }
1167
1168 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1169   SDLoc DL(Op);
1170   EVT VT = Op.getValueType();
1171
1172   SDValue LHS = Op.getOperand(0);
1173   SDValue RHS = Op.getOperand(1);
1174   SDValue True = Op.getOperand(2);
1175   SDValue False = Op.getOperand(3);
1176   SDValue CC = Op.getOperand(4);
1177   SDValue Temp;
1178
1179   // LHS and RHS are guaranteed to be the same value type
1180   EVT CompareVT = LHS.getValueType();
1181
1182   // Check if we can lower this to a native operation.
1183
1184   // Try to lower to a SET* instruction:
1185   //
1186   // SET* can match the following patterns:
1187   //
1188   // select_cc f32, f32, -1,  0, cc_supported
1189   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1190   // select_cc i32, i32, -1,  0, cc_supported
1191   //
1192
1193   // Move hardware True/False values to the correct operand.
1194   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1195   ISD::CondCode InverseCC =
1196      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1197   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1198     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1199       std::swap(False, True);
1200       CC = DAG.getCondCode(InverseCC);
1201     } else {
1202       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1203       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1204         std::swap(False, True);
1205         std::swap(LHS, RHS);
1206         CC = DAG.getCondCode(SwapInvCC);
1207       }
1208     }
1209   }
1210
1211   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1212       (CompareVT == VT || VT == MVT::i32)) {
1213     // This can be matched by a SET* instruction.
1214     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1215   }
1216
1217   // Try to lower to a CND* instruction:
1218   //
1219   // CND* can match the following patterns:
1220   //
1221   // select_cc f32, 0.0, f32, f32, cc_supported
1222   // select_cc f32, 0.0, i32, i32, cc_supported
1223   // select_cc i32, 0,   f32, f32, cc_supported
1224   // select_cc i32, 0,   i32, i32, cc_supported
1225   //
1226
1227   // Try to move the zero value to the RHS
1228   if (isZero(LHS)) {
1229     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1230     // Try swapping the operands
1231     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1232     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1233       std::swap(LHS, RHS);
1234       CC = DAG.getCondCode(CCSwapped);
1235     } else {
1236       // Try inverting the conditon and then swapping the operands
1237       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1238       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1239       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1240         std::swap(True, False);
1241         std::swap(LHS, RHS);
1242         CC = DAG.getCondCode(CCSwapped);
1243       }
1244     }
1245   }
1246   if (isZero(RHS)) {
1247     SDValue Cond = LHS;
1248     SDValue Zero = RHS;
1249     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1250     if (CompareVT != VT) {
1251       // Bitcast True / False to the correct types.  This will end up being
1252       // a nop, but it allows us to define only a single pattern in the
1253       // .TD files for each CND* instruction rather than having to have
1254       // one pattern for integer True/False and one for fp True/False
1255       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1256       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1257     }
1258
1259     switch (CCOpcode) {
1260     case ISD::SETONE:
1261     case ISD::SETUNE:
1262     case ISD::SETNE:
1263       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1264       Temp = True;
1265       True = False;
1266       False = Temp;
1267       break;
1268     default:
1269       break;
1270     }
1271     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1272         Cond, Zero,
1273         True, False,
1274         DAG.getCondCode(CCOpcode));
1275     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1276   }
1277
1278   // If we make it this for it means we have no native instructions to handle
1279   // this SELECT_CC, so we must lower it.
1280   SDValue HWTrue, HWFalse;
1281
1282   if (CompareVT == MVT::f32) {
1283     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1284     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1285   } else if (CompareVT == MVT::i32) {
1286     HWTrue = DAG.getConstant(-1, CompareVT);
1287     HWFalse = DAG.getConstant(0, CompareVT);
1288   }
1289   else {
1290     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1291   }
1292
1293   // Lower this unsupported SELECT_CC into a combination of two supported
1294   // SELECT_CC operations.
1295   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1296
1297   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1298       Cond, HWFalse,
1299       True, False,
1300       DAG.getCondCode(ISD::SETNE));
1301 }
1302
1303 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1304 /// convert these pointers to a register index.  Each register holds
1305 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1306 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1307 /// for indirect addressing.
1308 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1309                                                unsigned StackWidth,
1310                                                SelectionDAG &DAG) const {
1311   unsigned SRLPad;
1312   switch(StackWidth) {
1313   case 1:
1314     SRLPad = 2;
1315     break;
1316   case 2:
1317     SRLPad = 3;
1318     break;
1319   case 4:
1320     SRLPad = 4;
1321     break;
1322   default: llvm_unreachable("Invalid stack width");
1323   }
1324
1325   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1326                      DAG.getConstant(SRLPad, MVT::i32));
1327 }
1328
1329 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1330                                          unsigned ElemIdx,
1331                                          unsigned &Channel,
1332                                          unsigned &PtrIncr) const {
1333   switch (StackWidth) {
1334   default:
1335   case 1:
1336     Channel = 0;
1337     if (ElemIdx > 0) {
1338       PtrIncr = 1;
1339     } else {
1340       PtrIncr = 0;
1341     }
1342     break;
1343   case 2:
1344     Channel = ElemIdx % 2;
1345     if (ElemIdx == 2) {
1346       PtrIncr = 1;
1347     } else {
1348       PtrIncr = 0;
1349     }
1350     break;
1351   case 4:
1352     Channel = ElemIdx;
1353     PtrIncr = 0;
1354     break;
1355   }
1356 }
1357
1358 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1359   SDLoc DL(Op);
1360   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1361   SDValue Chain = Op.getOperand(0);
1362   SDValue Value = Op.getOperand(1);
1363   SDValue Ptr = Op.getOperand(2);
1364
1365   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1366   if (Result.getNode()) {
1367     return Result;
1368   }
1369
1370   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1371     if (StoreNode->isTruncatingStore()) {
1372       EVT VT = Value.getValueType();
1373       assert(VT.bitsLE(MVT::i32));
1374       EVT MemVT = StoreNode->getMemoryVT();
1375       SDValue MaskConstant;
1376       if (MemVT == MVT::i8) {
1377         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1378       } else {
1379         assert(MemVT == MVT::i16);
1380         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1381       }
1382       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1383                                       DAG.getConstant(2, MVT::i32));
1384       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1385                                       DAG.getConstant(0x00000003, VT));
1386       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1387       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1388                                    DAG.getConstant(3, VT));
1389       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1390       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1391       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1392       // vector instead.
1393       SDValue Src[4] = {
1394         ShiftedValue,
1395         DAG.getConstant(0, MVT::i32),
1396         DAG.getConstant(0, MVT::i32),
1397         Mask
1398       };
1399       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1400       SDValue Args[3] = { Chain, Input, DWordAddr };
1401       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1402                                      Op->getVTList(), Args, MemVT,
1403                                      StoreNode->getMemOperand());
1404     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1405                Value.getValueType().bitsGE(MVT::i32)) {
1406       // Convert pointer from byte address to dword address.
1407       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1408                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1409                                     Ptr, DAG.getConstant(2, MVT::i32)));
1410
1411       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1412         llvm_unreachable("Truncated and indexed stores not supported yet");
1413       } else {
1414         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1415       }
1416       return Chain;
1417     }
1418   }
1419
1420   EVT ValueVT = Value.getValueType();
1421
1422   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1423     return SDValue();
1424   }
1425
1426   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1427   if (Ret.getNode()) {
1428     return Ret;
1429   }
1430   // Lowering for indirect addressing
1431
1432   const MachineFunction &MF = DAG.getMachineFunction();
1433   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1434       getTargetMachine().getSubtargetImpl()->getFrameLowering());
1435   unsigned StackWidth = TFL->getStackWidth(MF);
1436
1437   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1438
1439   if (ValueVT.isVector()) {
1440     unsigned NumElemVT = ValueVT.getVectorNumElements();
1441     EVT ElemVT = ValueVT.getVectorElementType();
1442     SmallVector<SDValue, 4> Stores(NumElemVT);
1443
1444     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1445                                       "vector width in load");
1446
1447     for (unsigned i = 0; i < NumElemVT; ++i) {
1448       unsigned Channel, PtrIncr;
1449       getStackAddress(StackWidth, i, Channel, PtrIncr);
1450       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1451                         DAG.getConstant(PtrIncr, MVT::i32));
1452       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1453                                  Value, DAG.getConstant(i, MVT::i32));
1454
1455       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1456                               Chain, Elem, Ptr,
1457                               DAG.getTargetConstant(Channel, MVT::i32));
1458     }
1459      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1460    } else {
1461     if (ValueVT == MVT::i8) {
1462       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1463     }
1464     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1465     DAG.getTargetConstant(0, MVT::i32)); // Channel
1466   }
1467
1468   return Chain;
1469 }
1470
1471 // return (512 + (kc_bank << 12)
1472 static int
1473 ConstantAddressBlock(unsigned AddressSpace) {
1474   switch (AddressSpace) {
1475   case AMDGPUAS::CONSTANT_BUFFER_0:
1476     return 512;
1477   case AMDGPUAS::CONSTANT_BUFFER_1:
1478     return 512 + 4096;
1479   case AMDGPUAS::CONSTANT_BUFFER_2:
1480     return 512 + 4096 * 2;
1481   case AMDGPUAS::CONSTANT_BUFFER_3:
1482     return 512 + 4096 * 3;
1483   case AMDGPUAS::CONSTANT_BUFFER_4:
1484     return 512 + 4096 * 4;
1485   case AMDGPUAS::CONSTANT_BUFFER_5:
1486     return 512 + 4096 * 5;
1487   case AMDGPUAS::CONSTANT_BUFFER_6:
1488     return 512 + 4096 * 6;
1489   case AMDGPUAS::CONSTANT_BUFFER_7:
1490     return 512 + 4096 * 7;
1491   case AMDGPUAS::CONSTANT_BUFFER_8:
1492     return 512 + 4096 * 8;
1493   case AMDGPUAS::CONSTANT_BUFFER_9:
1494     return 512 + 4096 * 9;
1495   case AMDGPUAS::CONSTANT_BUFFER_10:
1496     return 512 + 4096 * 10;
1497   case AMDGPUAS::CONSTANT_BUFFER_11:
1498     return 512 + 4096 * 11;
1499   case AMDGPUAS::CONSTANT_BUFFER_12:
1500     return 512 + 4096 * 12;
1501   case AMDGPUAS::CONSTANT_BUFFER_13:
1502     return 512 + 4096 * 13;
1503   case AMDGPUAS::CONSTANT_BUFFER_14:
1504     return 512 + 4096 * 14;
1505   case AMDGPUAS::CONSTANT_BUFFER_15:
1506     return 512 + 4096 * 15;
1507   default:
1508     return -1;
1509   }
1510 }
1511
1512 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1513 {
1514   EVT VT = Op.getValueType();
1515   SDLoc DL(Op);
1516   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1517   SDValue Chain = Op.getOperand(0);
1518   SDValue Ptr = Op.getOperand(1);
1519   SDValue LoweredLoad;
1520
1521   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1522   if (Ret.getNode()) {
1523     SDValue Ops[2] = {
1524       Ret,
1525       Chain
1526     };
1527     return DAG.getMergeValues(Ops, DL);
1528   }
1529
1530   // Lower loads constant address space global variable loads
1531   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1532       isa<GlobalVariable>(
1533           GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
1534
1535     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1536         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1537     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1538         DAG.getConstant(2, MVT::i32));
1539     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1540                        LoadNode->getChain(), Ptr,
1541                        DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
1542   }
1543
1544   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1545     SDValue MergedValues[2] = {
1546       ScalarizeVectorLoad(Op, DAG),
1547       Chain
1548     };
1549     return DAG.getMergeValues(MergedValues, DL);
1550   }
1551
1552   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1553   if (ConstantBlock > -1 &&
1554       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1555        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1556     SDValue Result;
1557     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1558         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1559         isa<ConstantSDNode>(Ptr)) {
1560       SDValue Slots[4];
1561       for (unsigned i = 0; i < 4; i++) {
1562         // We want Const position encoded with the following formula :
1563         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1564         // const_index is Ptr computed by llvm using an alignment of 16.
1565         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1566         // then div by 4 at the ISel step
1567         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1568             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1569         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1570       }
1571       EVT NewVT = MVT::v4i32;
1572       unsigned NumElements = 4;
1573       if (VT.isVector()) {
1574         NewVT = VT;
1575         NumElements = VT.getVectorNumElements();
1576       }
1577       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1578                            makeArrayRef(Slots, NumElements));
1579     } else {
1580       // non-constant ptr can't be folded, keeps it as a v4f32 load
1581       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1582           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1583           DAG.getConstant(LoadNode->getAddressSpace() -
1584                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1585           );
1586     }
1587
1588     if (!VT.isVector()) {
1589       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1590           DAG.getConstant(0, MVT::i32));
1591     }
1592
1593     SDValue MergedValues[2] = {
1594       Result,
1595       Chain
1596     };
1597     return DAG.getMergeValues(MergedValues, DL);
1598   }
1599
1600   // For most operations returning SDValue() will result in the node being
1601   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1602   // need to manually expand loads that may be legal in some address spaces and
1603   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1604   // compute shaders, since the data is sign extended when it is uploaded to the
1605   // buffer. However SEXT loads from other address spaces are not supported, so
1606   // we need to expand them here.
1607   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1608     EVT MemVT = LoadNode->getMemoryVT();
1609     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1610     SDValue ShiftAmount =
1611           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1612     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1613                                   LoadNode->getPointerInfo(), MemVT,
1614                                   LoadNode->isVolatile(),
1615                                   LoadNode->isNonTemporal(),
1616                                   LoadNode->isInvariant(),
1617                                   LoadNode->getAlignment());
1618     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1619     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1620
1621     SDValue MergedValues[2] = { Sra, Chain };
1622     return DAG.getMergeValues(MergedValues, DL);
1623   }
1624
1625   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1626     return SDValue();
1627   }
1628
1629   // Lowering for indirect addressing
1630   const MachineFunction &MF = DAG.getMachineFunction();
1631   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1632       getTargetMachine().getSubtargetImpl()->getFrameLowering());
1633   unsigned StackWidth = TFL->getStackWidth(MF);
1634
1635   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1636
1637   if (VT.isVector()) {
1638     unsigned NumElemVT = VT.getVectorNumElements();
1639     EVT ElemVT = VT.getVectorElementType();
1640     SDValue Loads[4];
1641
1642     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1643                                       "vector width in load");
1644
1645     for (unsigned i = 0; i < NumElemVT; ++i) {
1646       unsigned Channel, PtrIncr;
1647       getStackAddress(StackWidth, i, Channel, PtrIncr);
1648       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1649                         DAG.getConstant(PtrIncr, MVT::i32));
1650       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1651                              Chain, Ptr,
1652                              DAG.getTargetConstant(Channel, MVT::i32),
1653                              Op.getOperand(2));
1654     }
1655     for (unsigned i = NumElemVT; i < 4; ++i) {
1656       Loads[i] = DAG.getUNDEF(ElemVT);
1657     }
1658     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1659     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1660   } else {
1661     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1662                               Chain, Ptr,
1663                               DAG.getTargetConstant(0, MVT::i32), // Channel
1664                               Op.getOperand(2));
1665   }
1666
1667   SDValue Ops[2] = {
1668     LoweredLoad,
1669     Chain
1670   };
1671
1672   return DAG.getMergeValues(Ops, DL);
1673 }
1674
1675 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1676   SDValue Chain = Op.getOperand(0);
1677   SDValue Cond  = Op.getOperand(1);
1678   SDValue Jump  = Op.getOperand(2);
1679
1680   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1681                      Chain, Jump, Cond);
1682 }
1683
1684 /// XXX Only kernel functions are supported, so we can assume for now that
1685 /// every function is a kernel function, but in the future we should use
1686 /// separate calling conventions for kernel and non-kernel functions.
1687 SDValue R600TargetLowering::LowerFormalArguments(
1688                                       SDValue Chain,
1689                                       CallingConv::ID CallConv,
1690                                       bool isVarArg,
1691                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1692                                       SDLoc DL, SelectionDAG &DAG,
1693                                       SmallVectorImpl<SDValue> &InVals) const {
1694   SmallVector<CCValAssign, 16> ArgLocs;
1695   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1696                  *DAG.getContext());
1697   MachineFunction &MF = DAG.getMachineFunction();
1698   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();
1699
1700   SmallVector<ISD::InputArg, 8> LocalIns;
1701
1702   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1703
1704   AnalyzeFormalArguments(CCInfo, LocalIns);
1705
1706   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1707     CCValAssign &VA = ArgLocs[i];
1708     EVT VT = Ins[i].VT;
1709     EVT MemVT = LocalIns[i].VT;
1710
1711     if (ShaderType != ShaderType::COMPUTE) {
1712       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1713       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1714       InVals.push_back(Register);
1715       continue;
1716     }
1717
1718     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1719                                                    AMDGPUAS::CONSTANT_BUFFER_0);
1720
1721     // i64 isn't a legal type, so the register type used ends up as i32, which
1722     // isn't expected here. It attempts to create this sextload, but it ends up
1723     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1724     // for <1 x i64>.
1725
1726     // The first 36 bytes of the input buffer contains information about
1727     // thread group and global sizes.
1728
1729     // FIXME: This should really check the extload type, but the handling of
1730     // extload vecto parameters seems to be broken.
1731     //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1732     ISD::LoadExtType Ext = ISD::SEXTLOAD;
1733     SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
1734                                  DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1735                                  MachinePointerInfo(UndefValue::get(PtrTy)),
1736                                  MemVT, false, false, false, 4);
1737
1738     // 4 is the preferred alignment for the CONSTANT memory space.
1739     InVals.push_back(Arg);
1740   }
1741   return Chain;
1742 }
1743
1744 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1745    if (!VT.isVector())
1746      return MVT::i32;
1747    return VT.changeVectorElementTypeToInteger();
1748 }
1749
1750 static SDValue CompactSwizzlableVector(
1751   SelectionDAG &DAG, SDValue VectorEntry,
1752   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1753   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1754   assert(RemapSwizzle.empty());
1755   SDValue NewBldVec[4] = {
1756     VectorEntry.getOperand(0),
1757     VectorEntry.getOperand(1),
1758     VectorEntry.getOperand(2),
1759     VectorEntry.getOperand(3)
1760   };
1761
1762   for (unsigned i = 0; i < 4; i++) {
1763     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1764       // We mask write here to teach later passes that the ith element of this
1765       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1766       // break false dependencies and additionnaly make assembly easier to read.
1767       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1768     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1769       if (C->isZero()) {
1770         RemapSwizzle[i] = 4; // SEL_0
1771         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1772       } else if (C->isExactlyValue(1.0)) {
1773         RemapSwizzle[i] = 5; // SEL_1
1774         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1775       }
1776     }
1777
1778     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1779       continue;
1780     for (unsigned j = 0; j < i; j++) {
1781       if (NewBldVec[i] == NewBldVec[j]) {
1782         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1783         RemapSwizzle[i] = j;
1784         break;
1785       }
1786     }
1787   }
1788
1789   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1790                      VectorEntry.getValueType(), NewBldVec);
1791 }
1792
1793 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1794                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1795   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1796   assert(RemapSwizzle.empty());
1797   SDValue NewBldVec[4] = {
1798       VectorEntry.getOperand(0),
1799       VectorEntry.getOperand(1),
1800       VectorEntry.getOperand(2),
1801       VectorEntry.getOperand(3)
1802   };
1803   bool isUnmovable[4] = { false, false, false, false };
1804   for (unsigned i = 0; i < 4; i++) {
1805     RemapSwizzle[i] = i;
1806     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1807       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1808           ->getZExtValue();
1809       if (i == Idx)
1810         isUnmovable[Idx] = true;
1811     }
1812   }
1813
1814   for (unsigned i = 0; i < 4; i++) {
1815     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1816       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1817           ->getZExtValue();
1818       if (isUnmovable[Idx])
1819         continue;
1820       // Swap i and Idx
1821       std::swap(NewBldVec[Idx], NewBldVec[i]);
1822       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1823       break;
1824     }
1825   }
1826
1827   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1828                      VectorEntry.getValueType(), NewBldVec);
1829 }
1830
1831
1832 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1833 SDValue Swz[4], SelectionDAG &DAG) const {
1834   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1835   // Old -> New swizzle values
1836   DenseMap<unsigned, unsigned> SwizzleRemap;
1837
1838   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1839   for (unsigned i = 0; i < 4; i++) {
1840     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1841     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1842       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1843   }
1844
1845   SwizzleRemap.clear();
1846   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1847   for (unsigned i = 0; i < 4; i++) {
1848     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1849     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1850       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1851   }
1852
1853   return BuildVector;
1854 }
1855
1856
1857 //===----------------------------------------------------------------------===//
1858 // Custom DAG Optimizations
1859 //===----------------------------------------------------------------------===//
1860
1861 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1862                                               DAGCombinerInfo &DCI) const {
1863   SelectionDAG &DAG = DCI.DAG;
1864
1865   switch (N->getOpcode()) {
1866   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1867   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1868   case ISD::FP_ROUND: {
1869       SDValue Arg = N->getOperand(0);
1870       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1871         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1872                            Arg.getOperand(0));
1873       }
1874       break;
1875     }
1876
1877   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1878   // (i32 select_cc f32, f32, -1, 0 cc)
1879   //
1880   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1881   // this to one of the SET*_DX10 instructions.
1882   case ISD::FP_TO_SINT: {
1883     SDValue FNeg = N->getOperand(0);
1884     if (FNeg.getOpcode() != ISD::FNEG) {
1885       return SDValue();
1886     }
1887     SDValue SelectCC = FNeg.getOperand(0);
1888     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1889         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1890         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1891         !isHWTrueValue(SelectCC.getOperand(2)) ||
1892         !isHWFalseValue(SelectCC.getOperand(3))) {
1893       return SDValue();
1894     }
1895
1896     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1897                            SelectCC.getOperand(0), // LHS
1898                            SelectCC.getOperand(1), // RHS
1899                            DAG.getConstant(-1, MVT::i32), // True
1900                            DAG.getConstant(0, MVT::i32),  // Flase
1901                            SelectCC.getOperand(4)); // CC
1902
1903     break;
1904   }
1905
1906   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1907   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1908   case ISD::INSERT_VECTOR_ELT: {
1909     SDValue InVec = N->getOperand(0);
1910     SDValue InVal = N->getOperand(1);
1911     SDValue EltNo = N->getOperand(2);
1912     SDLoc dl(N);
1913
1914     // If the inserted element is an UNDEF, just use the input vector.
1915     if (InVal.getOpcode() == ISD::UNDEF)
1916       return InVec;
1917
1918     EVT VT = InVec.getValueType();
1919
1920     // If we can't generate a legal BUILD_VECTOR, exit
1921     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1922       return SDValue();
1923
1924     // Check that we know which element is being inserted
1925     if (!isa<ConstantSDNode>(EltNo))
1926       return SDValue();
1927     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1928
1929     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1930     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1931     // vector elements.
1932     SmallVector<SDValue, 8> Ops;
1933     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1934       Ops.append(InVec.getNode()->op_begin(),
1935                  InVec.getNode()->op_end());
1936     } else if (InVec.getOpcode() == ISD::UNDEF) {
1937       unsigned NElts = VT.getVectorNumElements();
1938       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1939     } else {
1940       return SDValue();
1941     }
1942
1943     // Insert the element
1944     if (Elt < Ops.size()) {
1945       // All the operands of BUILD_VECTOR must have the same type;
1946       // we enforce that here.
1947       EVT OpVT = Ops[0].getValueType();
1948       if (InVal.getValueType() != OpVT)
1949         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1950           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1951           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1952       Ops[Elt] = InVal;
1953     }
1954
1955     // Return the new vector
1956     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1957   }
1958
1959   // Extract_vec (Build_vector) generated by custom lowering
1960   // also needs to be customly combined
1961   case ISD::EXTRACT_VECTOR_ELT: {
1962     SDValue Arg = N->getOperand(0);
1963     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1964       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1965         unsigned Element = Const->getZExtValue();
1966         return Arg->getOperand(Element);
1967       }
1968     }
1969     if (Arg.getOpcode() == ISD::BITCAST &&
1970         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1971       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1972         unsigned Element = Const->getZExtValue();
1973         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1974             Arg->getOperand(0).getOperand(Element));
1975       }
1976     }
1977   }
1978
1979   case ISD::SELECT_CC: {
1980     // Try common optimizations
1981     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1982     if (Ret.getNode())
1983       return Ret;
1984
1985     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1986     //      selectcc x, y, a, b, inv(cc)
1987     //
1988     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1989     //      selectcc x, y, a, b, cc
1990     SDValue LHS = N->getOperand(0);
1991     if (LHS.getOpcode() != ISD::SELECT_CC) {
1992       return SDValue();
1993     }
1994
1995     SDValue RHS = N->getOperand(1);
1996     SDValue True = N->getOperand(2);
1997     SDValue False = N->getOperand(3);
1998     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1999
2000     if (LHS.getOperand(2).getNode() != True.getNode() ||
2001         LHS.getOperand(3).getNode() != False.getNode() ||
2002         RHS.getNode() != False.getNode()) {
2003       return SDValue();
2004     }
2005
2006     switch (NCC) {
2007     default: return SDValue();
2008     case ISD::SETNE: return LHS;
2009     case ISD::SETEQ: {
2010       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2011       LHSCC = ISD::getSetCCInverse(LHSCC,
2012                                   LHS.getOperand(0).getValueType().isInteger());
2013       if (DCI.isBeforeLegalizeOps() ||
2014           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2015         return DAG.getSelectCC(SDLoc(N),
2016                                LHS.getOperand(0),
2017                                LHS.getOperand(1),
2018                                LHS.getOperand(2),
2019                                LHS.getOperand(3),
2020                                LHSCC);
2021       break;
2022     }
2023     }
2024     return SDValue();
2025   }
2026
2027   case AMDGPUISD::EXPORT: {
2028     SDValue Arg = N->getOperand(1);
2029     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2030       break;
2031
2032     SDValue NewArgs[8] = {
2033       N->getOperand(0), // Chain
2034       SDValue(),
2035       N->getOperand(2), // ArrayBase
2036       N->getOperand(3), // Type
2037       N->getOperand(4), // SWZ_X
2038       N->getOperand(5), // SWZ_Y
2039       N->getOperand(6), // SWZ_Z
2040       N->getOperand(7) // SWZ_W
2041     };
2042     SDLoc DL(N);
2043     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2044     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2045   }
2046   case AMDGPUISD::TEXTURE_FETCH: {
2047     SDValue Arg = N->getOperand(1);
2048     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2049       break;
2050
2051     SDValue NewArgs[19] = {
2052       N->getOperand(0),
2053       N->getOperand(1),
2054       N->getOperand(2),
2055       N->getOperand(3),
2056       N->getOperand(4),
2057       N->getOperand(5),
2058       N->getOperand(6),
2059       N->getOperand(7),
2060       N->getOperand(8),
2061       N->getOperand(9),
2062       N->getOperand(10),
2063       N->getOperand(11),
2064       N->getOperand(12),
2065       N->getOperand(13),
2066       N->getOperand(14),
2067       N->getOperand(15),
2068       N->getOperand(16),
2069       N->getOperand(17),
2070       N->getOperand(18),
2071     };
2072     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2073     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2074         NewArgs);
2075   }
2076   }
2077
2078   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2079 }
2080
2081 static bool
2082 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2083             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2084   const R600InstrInfo *TII =
2085       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2086   if (!Src.isMachineOpcode())
2087     return false;
2088   switch (Src.getMachineOpcode()) {
2089   case AMDGPU::FNEG_R600:
2090     if (!Neg.getNode())
2091       return false;
2092     Src = Src.getOperand(0);
2093     Neg = DAG.getTargetConstant(1, MVT::i32);
2094     return true;
2095   case AMDGPU::FABS_R600:
2096     if (!Abs.getNode())
2097       return false;
2098     Src = Src.getOperand(0);
2099     Abs = DAG.getTargetConstant(1, MVT::i32);
2100     return true;
2101   case AMDGPU::CONST_COPY: {
2102     unsigned Opcode = ParentNode->getMachineOpcode();
2103     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2104
2105     if (!Sel.getNode())
2106       return false;
2107
2108     SDValue CstOffset = Src.getOperand(0);
2109     if (ParentNode->getValueType(0).isVector())
2110       return false;
2111
2112     // Gather constants values
2113     int SrcIndices[] = {
2114       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2115       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2116       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2117       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2118       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2119       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2120       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2121       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2122       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2123       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2124       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2125     };
2126     std::vector<unsigned> Consts;
2127     for (int OtherSrcIdx : SrcIndices) {
2128       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2129       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2130         continue;
2131       if (HasDst) {
2132         OtherSrcIdx--;
2133         OtherSelIdx--;
2134       }
2135       if (RegisterSDNode *Reg =
2136           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2137         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2138           ConstantSDNode *Cst
2139             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2140           Consts.push_back(Cst->getZExtValue());
2141         }
2142       }
2143     }
2144
2145     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2146     Consts.push_back(Cst->getZExtValue());
2147     if (!TII->fitsConstReadLimitations(Consts)) {
2148       return false;
2149     }
2150
2151     Sel = CstOffset;
2152     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2153     return true;
2154   }
2155   case AMDGPU::MOV_IMM_I32:
2156   case AMDGPU::MOV_IMM_F32: {
2157     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2158     uint64_t ImmValue = 0;
2159
2160
2161     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2162       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2163       float FloatValue = FPC->getValueAPF().convertToFloat();
2164       if (FloatValue == 0.0) {
2165         ImmReg = AMDGPU::ZERO;
2166       } else if (FloatValue == 0.5) {
2167         ImmReg = AMDGPU::HALF;
2168       } else if (FloatValue == 1.0) {
2169         ImmReg = AMDGPU::ONE;
2170       } else {
2171         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2172       }
2173     } else {
2174       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2175       uint64_t Value = C->getZExtValue();
2176       if (Value == 0) {
2177         ImmReg = AMDGPU::ZERO;
2178       } else if (Value == 1) {
2179         ImmReg = AMDGPU::ONE_INT;
2180       } else {
2181         ImmValue = Value;
2182       }
2183     }
2184
2185     // Check that we aren't already using an immediate.
2186     // XXX: It's possible for an instruction to have more than one
2187     // immediate operand, but this is not supported yet.
2188     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2189       if (!Imm.getNode())
2190         return false;
2191       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2192       assert(C);
2193       if (C->getZExtValue())
2194         return false;
2195       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2196     }
2197     Src = DAG.getRegister(ImmReg, MVT::i32);
2198     return true;
2199   }
2200   default:
2201     return false;
2202   }
2203 }
2204
2205
2206 /// \brief Fold the instructions after selecting them
2207 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2208                                             SelectionDAG &DAG) const {
2209   const R600InstrInfo *TII =
2210       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2211   if (!Node->isMachineOpcode())
2212     return Node;
2213   unsigned Opcode = Node->getMachineOpcode();
2214   SDValue FakeOp;
2215
2216   std::vector<SDValue> Ops;
2217   for (const SDUse &I : Node->ops())
2218     Ops.push_back(I);
2219
2220   if (Opcode == AMDGPU::DOT_4) {
2221     int OperandIdx[] = {
2222       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2223       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2224       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2225       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2226       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2227       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2228       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2229       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2230         };
2231     int NegIdx[] = {
2232       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2233       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2234       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2235       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2236       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2237       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2238       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2239       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2240     };
2241     int AbsIdx[] = {
2242       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2243       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2244       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2245       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2246       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2247       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2248       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2249       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2250     };
2251     for (unsigned i = 0; i < 8; i++) {
2252       if (OperandIdx[i] < 0)
2253         return Node;
2254       SDValue &Src = Ops[OperandIdx[i] - 1];
2255       SDValue &Neg = Ops[NegIdx[i] - 1];
2256       SDValue &Abs = Ops[AbsIdx[i] - 1];
2257       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2258       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2259       if (HasDst)
2260         SelIdx--;
2261       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2262       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2263         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2264     }
2265   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2266     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2267       SDValue &Src = Ops[i];
2268       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2269         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2270     }
2271   } else if (Opcode == AMDGPU::CLAMP_R600) {
2272     SDValue Src = Node->getOperand(0);
2273     if (!Src.isMachineOpcode() ||
2274         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2275       return Node;
2276     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2277         AMDGPU::OpName::clamp);
2278     if (ClampIdx < 0)
2279       return Node;
2280     std::vector<SDValue> Ops;
2281     unsigned NumOp = Src.getNumOperands();
2282     for(unsigned i = 0; i < NumOp; ++i)
2283           Ops.push_back(Src.getOperand(i));
2284     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2285     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2286         Node->getVTList(), Ops);
2287   } else {
2288     if (!TII->hasInstrModifiers(Opcode))
2289       return Node;
2290     int OperandIdx[] = {
2291       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2292       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2293       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2294     };
2295     int NegIdx[] = {
2296       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2297       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2298       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2299     };
2300     int AbsIdx[] = {
2301       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2302       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2303       -1
2304     };
2305     for (unsigned i = 0; i < 3; i++) {
2306       if (OperandIdx[i] < 0)
2307         return Node;
2308       SDValue &Src = Ops[OperandIdx[i] - 1];
2309       SDValue &Neg = Ops[NegIdx[i] - 1];
2310       SDValue FakeAbs;
2311       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2312       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2313       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2314       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2315       if (HasDst) {
2316         SelIdx--;
2317         ImmIdx--;
2318       }
2319       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2320       SDValue &Imm = Ops[ImmIdx];
2321       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2322         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2323     }
2324   }
2325
2326   return Node;
2327 }