lib/Target/AMDGPU/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
  34                                        const AMDGPUSubtarget &STI)
  35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42
  43   computeRegisterProperties(STI.getRegisterInfo());
  44
  45   // Set condition code actions
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // ADD, SUB overflow.
  95   // TODO: turn these into Legal?
  96   if (Subtarget->hasCARRY())
  97     setOperationAction(ISD::UADDO, MVT::i32, Custom);
  98
  99   if (Subtarget->hasBORROW())
 100     setOperationAction(ISD::USUBO, MVT::i32, Custom);
 101
 102   // Expand sign extension of vectors
 103   if (!Subtarget->hasBFE())
 104     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 105
 106   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
 107   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 108
 109   if (!Subtarget->hasBFE())
 110     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 113
 114   if (!Subtarget->hasBFE())
 115     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 116   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 117   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 118
 119   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 120   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 121   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 122
 123   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 124
 125
 126   // Legalize loads and stores to the private address space.
 127   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 128   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 129   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 130
 131   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 132   // spaces, so it is custom lowered to handle those where it isn't.
 133   for (MVT VT : MVT::integer_valuetypes()) {
 134     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 135     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
 136     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
 137
 138     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 139     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
 140     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
 141
 142     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 143     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
 144     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
 145   }
 146
 147   setOperationAction(ISD::STORE, MVT::i8, Custom);
 148   setOperationAction(ISD::STORE, MVT::i32, Custom);
 149   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 150   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 151   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 152   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 153
 154   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 155   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 156   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 157
 158   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 159   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 160   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 161   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 162
 163   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 164   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 165   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 166   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 167
 168   setTargetDAGCombine(ISD::FP_ROUND);
 169   setTargetDAGCombine(ISD::FP_TO_SINT);
 170   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 171   setTargetDAGCombine(ISD::SELECT_CC);
 172   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 173
 174   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 175   //  to be Legal/Custom in order to avoid library calls.
 176   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 177   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 178   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 179
 180   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 181
 182   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 183   for (MVT VT : ScalarIntVTs) {
 184     setOperationAction(ISD::ADDC, VT, Expand);
 185     setOperationAction(ISD::SUBC, VT, Expand);
 186     setOperationAction(ISD::ADDE, VT, Expand);
 187     setOperationAction(ISD::SUBE, VT, Expand);
 188   }
 189
 190   setSchedulingPreference(Sched::Source);
 191 }
 192
 193 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 194     MachineInstr * MI, MachineBasicBlock * BB) const {
 195   MachineFunction * MF = BB->getParent();
 196   MachineRegisterInfo &MRI = MF->getRegInfo();
 197   MachineBasicBlock::iterator I = *MI;
 198   const R600InstrInfo *TII =
 199       static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 200
 201   switch (MI->getOpcode()) {
 202   default:
 203     // Replace LDS_*_RET instruction that don't have any uses with the
 204     // equivalent LDS_*_NORET instruction.
 205     if (TII->isLDSRetInstr(MI->getOpcode())) {
 206       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 207       assert(DstIdx != -1);
 208       MachineInstrBuilder NewMI;
 209       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 210       //        LDS_1A2D support and remove this special case.
 211       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
 212            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
 213         return BB;
 214
 215       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 216                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 217       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 218         NewMI.addOperand(MI->getOperand(i));
 219       }
 220     } else {
 221       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 222     }
 223     break;
 224   case AMDGPU::CLAMP_R600: {
 225     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 226                                                    AMDGPU::MOV,
 227                                                    MI->getOperand(0).getReg(),
 228                                                    MI->getOperand(1).getReg());
 229     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 230     break;
 231   }
 232
 233   case AMDGPU::FABS_R600: {
 234     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 235                                                     AMDGPU::MOV,
 236                                                     MI->getOperand(0).getReg(),
 237                                                     MI->getOperand(1).getReg());
 238     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 239     break;
 240   }
 241
 242   case AMDGPU::FNEG_R600: {
 243     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 244                                                     AMDGPU::MOV,
 245                                                     MI->getOperand(0).getReg(),
 246                                                     MI->getOperand(1).getReg());
 247     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 248     break;
 249   }
 250
 251   case AMDGPU::MASK_WRITE: {
 252     unsigned maskedRegister = MI->getOperand(0).getReg();
 253     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 254     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 255     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 256     break;
 257   }
 258
 259   case AMDGPU::MOV_IMM_F32:
 260     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 261                      MI->getOperand(1).getFPImm()->getValueAPF()
 262                          .bitcastToAPInt().getZExtValue());
 263     break;
 264   case AMDGPU::MOV_IMM_I32:
 265     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 266                      MI->getOperand(1).getImm());
 267     break;
 268   case AMDGPU::CONST_COPY: {
 269     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 270         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 271     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 272         MI->getOperand(1).getImm());
 273     break;
 274   }
 275
 276   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 277   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 278   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 279     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 280
 281     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 282             .addOperand(MI->getOperand(0))
 283             .addOperand(MI->getOperand(1))
 284             .addImm(EOP); // Set End of program bit
 285     break;
 286   }
 287
 288   case AMDGPU::TXD: {
 289     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 290     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 291     MachineOperand &RID = MI->getOperand(4);
 292     MachineOperand &SID = MI->getOperand(5);
 293     unsigned TextureId = MI->getOperand(6).getImm();
 294     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 295     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 296
 297     switch (TextureId) {
 298     case 5: // Rect
 299       CTX = CTY = 0;
 300       break;
 301     case 6: // Shadow1D
 302       SrcW = SrcZ;
 303       break;
 304     case 7: // Shadow2D
 305       SrcW = SrcZ;
 306       break;
 307     case 8: // ShadowRect
 308       CTX = CTY = 0;
 309       SrcW = SrcZ;
 310       break;
 311     case 9: // 1DArray
 312       SrcZ = SrcY;
 313       CTZ = 0;
 314       break;
 315     case 10: // 2DArray
 316       CTZ = 0;
 317       break;
 318     case 11: // Shadow1DArray
 319       SrcZ = SrcY;
 320       CTZ = 0;
 321       break;
 322     case 12: // Shadow2DArray
 323       CTZ = 0;
 324       break;
 325     }
 326     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 327             .addOperand(MI->getOperand(3))
 328             .addImm(SrcX)
 329             .addImm(SrcY)
 330             .addImm(SrcZ)
 331             .addImm(SrcW)
 332             .addImm(0)
 333             .addImm(0)
 334             .addImm(0)
 335             .addImm(0)
 336             .addImm(1)
 337             .addImm(2)
 338             .addImm(3)
 339             .addOperand(RID)
 340             .addOperand(SID)
 341             .addImm(CTX)
 342             .addImm(CTY)
 343             .addImm(CTZ)
 344             .addImm(CTW);
 345     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 346             .addOperand(MI->getOperand(2))
 347             .addImm(SrcX)
 348             .addImm(SrcY)
 349             .addImm(SrcZ)
 350             .addImm(SrcW)
 351             .addImm(0)
 352             .addImm(0)
 353             .addImm(0)
 354             .addImm(0)
 355             .addImm(1)
 356             .addImm(2)
 357             .addImm(3)
 358             .addOperand(RID)
 359             .addOperand(SID)
 360             .addImm(CTX)
 361             .addImm(CTY)
 362             .addImm(CTZ)
 363             .addImm(CTW);
 364     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 365             .addOperand(MI->getOperand(0))
 366             .addOperand(MI->getOperand(1))
 367             .addImm(SrcX)
 368             .addImm(SrcY)
 369             .addImm(SrcZ)
 370             .addImm(SrcW)
 371             .addImm(0)
 372             .addImm(0)
 373             .addImm(0)
 374             .addImm(0)
 375             .addImm(1)
 376             .addImm(2)
 377             .addImm(3)
 378             .addOperand(RID)
 379             .addOperand(SID)
 380             .addImm(CTX)
 381             .addImm(CTY)
 382             .addImm(CTZ)
 383             .addImm(CTW)
 384             .addReg(T0, RegState::Implicit)
 385             .addReg(T1, RegState::Implicit);
 386     break;
 387   }
 388
 389   case AMDGPU::TXD_SHADOW: {
 390     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 391     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 392     MachineOperand &RID = MI->getOperand(4);
 393     MachineOperand &SID = MI->getOperand(5);
 394     unsigned TextureId = MI->getOperand(6).getImm();
 395     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 396     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 397
 398     switch (TextureId) {
 399     case 5: // Rect
 400       CTX = CTY = 0;
 401       break;
 402     case 6: // Shadow1D
 403       SrcW = SrcZ;
 404       break;
 405     case 7: // Shadow2D
 406       SrcW = SrcZ;
 407       break;
 408     case 8: // ShadowRect
 409       CTX = CTY = 0;
 410       SrcW = SrcZ;
 411       break;
 412     case 9: // 1DArray
 413       SrcZ = SrcY;
 414       CTZ = 0;
 415       break;
 416     case 10: // 2DArray
 417       CTZ = 0;
 418       break;
 419     case 11: // Shadow1DArray
 420       SrcZ = SrcY;
 421       CTZ = 0;
 422       break;
 423     case 12: // Shadow2DArray
 424       CTZ = 0;
 425       break;
 426     }
 427
 428     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 429             .addOperand(MI->getOperand(3))
 430             .addImm(SrcX)
 431             .addImm(SrcY)
 432             .addImm(SrcZ)
 433             .addImm(SrcW)
 434             .addImm(0)
 435             .addImm(0)
 436             .addImm(0)
 437             .addImm(0)
 438             .addImm(1)
 439             .addImm(2)
 440             .addImm(3)
 441             .addOperand(RID)
 442             .addOperand(SID)
 443             .addImm(CTX)
 444             .addImm(CTY)
 445             .addImm(CTZ)
 446             .addImm(CTW);
 447     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 448             .addOperand(MI->getOperand(2))
 449             .addImm(SrcX)
 450             .addImm(SrcY)
 451             .addImm(SrcZ)
 452             .addImm(SrcW)
 453             .addImm(0)
 454             .addImm(0)
 455             .addImm(0)
 456             .addImm(0)
 457             .addImm(1)
 458             .addImm(2)
 459             .addImm(3)
 460             .addOperand(RID)
 461             .addOperand(SID)
 462             .addImm(CTX)
 463             .addImm(CTY)
 464             .addImm(CTZ)
 465             .addImm(CTW);
 466     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 467             .addOperand(MI->getOperand(0))
 468             .addOperand(MI->getOperand(1))
 469             .addImm(SrcX)
 470             .addImm(SrcY)
 471             .addImm(SrcZ)
 472             .addImm(SrcW)
 473             .addImm(0)
 474             .addImm(0)
 475             .addImm(0)
 476             .addImm(0)
 477             .addImm(1)
 478             .addImm(2)
 479             .addImm(3)
 480             .addOperand(RID)
 481             .addOperand(SID)
 482             .addImm(CTX)
 483             .addImm(CTY)
 484             .addImm(CTZ)
 485             .addImm(CTW)
 486             .addReg(T0, RegState::Implicit)
 487             .addReg(T1, RegState::Implicit);
 488     break;
 489   }
 490
 491   case AMDGPU::BRANCH:
 492       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 493               .addOperand(MI->getOperand(0));
 494       break;
 495
 496   case AMDGPU::BRANCH_COND_f32: {
 497     MachineInstr *NewMI =
 498       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 499               AMDGPU::PREDICATE_BIT)
 500               .addOperand(MI->getOperand(1))
 501               .addImm(OPCODE_IS_NOT_ZERO)
 502               .addImm(0); // Flags
 503     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 504     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 505             .addOperand(MI->getOperand(0))
 506             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 507     break;
 508   }
 509
 510   case AMDGPU::BRANCH_COND_i32: {
 511     MachineInstr *NewMI =
 512       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 513             AMDGPU::PREDICATE_BIT)
 514             .addOperand(MI->getOperand(1))
 515             .addImm(OPCODE_IS_NOT_ZERO_INT)
 516             .addImm(0); // Flags
 517     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 518     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 519            .addOperand(MI->getOperand(0))
 520             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 521     break;
 522   }
 523
 524   case AMDGPU::EG_ExportSwz:
 525   case AMDGPU::R600_ExportSwz: {
 526     // Instruction is left unmodified if its not the last one of its type
 527     bool isLastInstructionOfItsType = true;
 528     unsigned InstExportType = MI->getOperand(1).getImm();
 529     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 530          EndBlock = BB->end(); NextExportInst != EndBlock;
 531          NextExportInst = std::next(NextExportInst)) {
 532       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 533           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 534         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 535             .getImm();
 536         if (CurrentInstExportType == InstExportType) {
 537           isLastInstructionOfItsType = false;
 538           break;
 539         }
 540       }
 541     }
 542     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 543     if (!EOP && !isLastInstructionOfItsType)
 544       return BB;
 545     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 546     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 547             .addOperand(MI->getOperand(0))
 548             .addOperand(MI->getOperand(1))
 549             .addOperand(MI->getOperand(2))
 550             .addOperand(MI->getOperand(3))
 551             .addOperand(MI->getOperand(4))
 552             .addOperand(MI->getOperand(5))
 553             .addOperand(MI->getOperand(6))
 554             .addImm(CfInst)
 555             .addImm(EOP);
 556     break;
 557   }
 558   case AMDGPU::RETURN: {
 559     // RETURN instructions must have the live-out registers as implicit uses,
 560     // otherwise they appear dead.
 561     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 562     MachineInstrBuilder MIB(*MF, MI);
 563     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 564       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 565     return BB;
 566   }
 567   }
 568
 569   MI->eraseFromParent();
 570   return BB;
 571 }
 572
 573 //===----------------------------------------------------------------------===//
 574 // Custom DAG Lowering Operations
 575 //===----------------------------------------------------------------------===//
 576
 577 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 578   MachineFunction &MF = DAG.getMachineFunction();
 579   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 580   switch (Op.getOpcode()) {
 581   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 582   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 583   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 584   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 585   case ISD::SRA_PARTS:
 586   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 587   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
 588   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
 589   case ISD::FCOS:
 590   case ISD::FSIN: return LowerTrig(Op, DAG);
 591   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 592   case ISD::STORE: return LowerSTORE(Op, DAG);
 593   case ISD::LOAD: {
 594     SDValue Result = LowerLOAD(Op, DAG);
 595     assert((!Result.getNode() ||
 596             Result.getNode()->getNumValues() == 2) &&
 597            "Load should return a value and a chain");
 598     return Result;
 599   }
 600
 601   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 602   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 603   case ISD::INTRINSIC_VOID: {
 604     SDValue Chain = Op.getOperand(0);
 605     unsigned IntrinsicID =
 606                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 607     switch (IntrinsicID) {
 608     case AMDGPUIntrinsic::AMDGPU_store_output: {
 609       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 610       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 611       MFI->LiveOuts.push_back(Reg);
 612       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 613     }
 614     case AMDGPUIntrinsic::R600_store_swizzle: {
 615       SDLoc DL(Op);
 616       const SDValue Args[8] = {
 617         Chain,
 618         Op.getOperand(2), // Export Value
 619         Op.getOperand(3), // ArrayBase
 620         Op.getOperand(4), // Type
 621         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
 622         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
 623         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
 624         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
 625       };
 626       return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
 627     }
 628
 629     // default for switch(IntrinsicID)
 630     default: break;
 631     }
 632     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 633     break;
 634   }
 635   case ISD::INTRINSIC_WO_CHAIN: {
 636     unsigned IntrinsicID =
 637                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 638     EVT VT = Op.getValueType();
 639     SDLoc DL(Op);
 640     switch(IntrinsicID) {
 641     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 642     case AMDGPUIntrinsic::R600_load_input: {
 643       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 644       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 645       MachineFunction &MF = DAG.getMachineFunction();
 646       MachineRegisterInfo &MRI = MF.getRegInfo();
 647       MRI.addLiveIn(Reg);
 648       return DAG.getCopyFromReg(DAG.getEntryNode(),
 649           SDLoc(DAG.getEntryNode()), Reg, VT);
 650     }
 651
 652     case AMDGPUIntrinsic::R600_interp_input: {
 653       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 654       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 655       MachineSDNode *interp;
 656       if (ijb < 0) {
 657         const R600InstrInfo *TII =
 658             static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 659         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 660             MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32));
 661         return DAG.getTargetExtractSubreg(
 662             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 663             DL, MVT::f32, SDValue(interp, 0));
 664       }
 665       MachineFunction &MF = DAG.getMachineFunction();
 666       MachineRegisterInfo &MRI = MF.getRegInfo();
 667       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 668       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 669       MRI.addLiveIn(RegisterI);
 670       MRI.addLiveIn(RegisterJ);
 671       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 672           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 673       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 674           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 675
 676       if (slot % 4 < 2)
 677         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 678             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
 679             RegisterJNode, RegisterINode);
 680       else
 681         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 682             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
 683             RegisterJNode, RegisterINode);
 684       return SDValue(interp, slot % 2);
 685     }
 686     case AMDGPUIntrinsic::R600_interp_xy:
 687     case AMDGPUIntrinsic::R600_interp_zw: {
 688       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 689       MachineSDNode *interp;
 690       SDValue RegisterINode = Op.getOperand(2);
 691       SDValue RegisterJNode = Op.getOperand(3);
 692
 693       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 694         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 695             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
 696             RegisterJNode, RegisterINode);
 697       else
 698         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 699             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
 700             RegisterJNode, RegisterINode);
 701       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 702           SDValue(interp, 0), SDValue(interp, 1));
 703     }
 704     case AMDGPUIntrinsic::R600_tex:
 705     case AMDGPUIntrinsic::R600_texc:
 706     case AMDGPUIntrinsic::R600_txl:
 707     case AMDGPUIntrinsic::R600_txlc:
 708     case AMDGPUIntrinsic::R600_txb:
 709     case AMDGPUIntrinsic::R600_txbc:
 710     case AMDGPUIntrinsic::R600_txf:
 711     case AMDGPUIntrinsic::R600_txq:
 712     case AMDGPUIntrinsic::R600_ddx:
 713     case AMDGPUIntrinsic::R600_ddy:
 714     case AMDGPUIntrinsic::R600_ldptr: {
 715       unsigned TextureOp;
 716       switch (IntrinsicID) {
 717       case AMDGPUIntrinsic::R600_tex:
 718         TextureOp = 0;
 719         break;
 720       case AMDGPUIntrinsic::R600_texc:
 721         TextureOp = 1;
 722         break;
 723       case AMDGPUIntrinsic::R600_txl:
 724         TextureOp = 2;
 725         break;
 726       case AMDGPUIntrinsic::R600_txlc:
 727         TextureOp = 3;
 728         break;
 729       case AMDGPUIntrinsic::R600_txb:
 730         TextureOp = 4;
 731         break;
 732       case AMDGPUIntrinsic::R600_txbc:
 733         TextureOp = 5;
 734         break;
 735       case AMDGPUIntrinsic::R600_txf:
 736         TextureOp = 6;
 737         break;
 738       case AMDGPUIntrinsic::R600_txq:
 739         TextureOp = 7;
 740         break;
 741       case AMDGPUIntrinsic::R600_ddx:
 742         TextureOp = 8;
 743         break;
 744       case AMDGPUIntrinsic::R600_ddy:
 745         TextureOp = 9;
 746         break;
 747       case AMDGPUIntrinsic::R600_ldptr:
 748         TextureOp = 10;
 749         break;
 750       default:
 751         llvm_unreachable("Unknow Texture Operation");
 752       }
 753
 754       SDValue TexArgs[19] = {
 755         DAG.getConstant(TextureOp, DL, MVT::i32),
 756         Op.getOperand(1),
 757         DAG.getConstant(0, DL, MVT::i32),
 758         DAG.getConstant(1, DL, MVT::i32),
 759         DAG.getConstant(2, DL, MVT::i32),
 760         DAG.getConstant(3, DL, MVT::i32),
 761         Op.getOperand(2),
 762         Op.getOperand(3),
 763         Op.getOperand(4),
 764         DAG.getConstant(0, DL, MVT::i32),
 765         DAG.getConstant(1, DL, MVT::i32),
 766         DAG.getConstant(2, DL, MVT::i32),
 767         DAG.getConstant(3, DL, MVT::i32),
 768         Op.getOperand(5),
 769         Op.getOperand(6),
 770         Op.getOperand(7),
 771         Op.getOperand(8),
 772         Op.getOperand(9),
 773         Op.getOperand(10)
 774       };
 775       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 776     }
 777     case AMDGPUIntrinsic::AMDGPU_dp4: {
 778       SDValue Args[8] = {
 779       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 780           DAG.getConstant(0, DL, MVT::i32)),
 781       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 782           DAG.getConstant(0, DL, MVT::i32)),
 783       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 784           DAG.getConstant(1, DL, MVT::i32)),
 785       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 786           DAG.getConstant(1, DL, MVT::i32)),
 787       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 788           DAG.getConstant(2, DL, MVT::i32)),
 789       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 790           DAG.getConstant(2, DL, MVT::i32)),
 791       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 792           DAG.getConstant(3, DL, MVT::i32)),
 793       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 794           DAG.getConstant(3, DL, MVT::i32))
 795       };
 796       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 797     }
 798
 799     case Intrinsic::r600_read_ngroups_x:
 800       return LowerImplicitParameter(DAG, VT, DL, 0);
 801     case Intrinsic::r600_read_ngroups_y:
 802       return LowerImplicitParameter(DAG, VT, DL, 1);
 803     case Intrinsic::r600_read_ngroups_z:
 804       return LowerImplicitParameter(DAG, VT, DL, 2);
 805     case Intrinsic::r600_read_global_size_x:
 806       return LowerImplicitParameter(DAG, VT, DL, 3);
 807     case Intrinsic::r600_read_global_size_y:
 808       return LowerImplicitParameter(DAG, VT, DL, 4);
 809     case Intrinsic::r600_read_global_size_z:
 810       return LowerImplicitParameter(DAG, VT, DL, 5);
 811     case Intrinsic::r600_read_local_size_x:
 812       return LowerImplicitParameter(DAG, VT, DL, 6);
 813     case Intrinsic::r600_read_local_size_y:
 814       return LowerImplicitParameter(DAG, VT, DL, 7);
 815     case Intrinsic::r600_read_local_size_z:
 816       return LowerImplicitParameter(DAG, VT, DL, 8);
 817
 818     case Intrinsic::AMDGPU_read_workdim:
 819       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
 820
 821     case Intrinsic::r600_read_tgid_x:
 822       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 823                                   AMDGPU::T1_X, VT);
 824     case Intrinsic::r600_read_tgid_y:
 825       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 826                                   AMDGPU::T1_Y, VT);
 827     case Intrinsic::r600_read_tgid_z:
 828       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 829                                   AMDGPU::T1_Z, VT);
 830     case Intrinsic::r600_read_tidig_x:
 831       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 832                                   AMDGPU::T0_X, VT);
 833     case Intrinsic::r600_read_tidig_y:
 834       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 835                                   AMDGPU::T0_Y, VT);
 836     case Intrinsic::r600_read_tidig_z:
 837       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 838                                   AMDGPU::T0_Z, VT);
 839     case Intrinsic::AMDGPU_rsq:
 840       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 841       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 842
 843     case AMDGPUIntrinsic::AMDGPU_fract:
 844     case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
 845       return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
 846     }
 847     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 848     break;
 849   }
 850   } // end switch(Op.getOpcode())
 851   return SDValue();
 852 }
 853
 854 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 855                                             SmallVectorImpl<SDValue> &Results,
 856                                             SelectionDAG &DAG) const {
 857   switch (N->getOpcode()) {
 858   default:
 859     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 860     return;
 861   case ISD::FP_TO_UINT:
 862     if (N->getValueType(0) == MVT::i1) {
 863       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 864       return;
 865     }
 866     // Fall-through. Since we don't care about out of bounds values
 867     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 868     // considers some extra cases which are not necessary here.
 869   case ISD::FP_TO_SINT: {
 870     SDValue Result;
 871     if (expandFP_TO_SINT(N, Result, DAG))
 872       Results.push_back(Result);
 873     return;
 874   }
 875   case ISD::SDIVREM: {
 876     SDValue Op = SDValue(N, 1);
 877     SDValue RES = LowerSDIVREM(Op, DAG);
 878     Results.push_back(RES);
 879     Results.push_back(RES.getValue(1));
 880     break;
 881   }
 882   case ISD::UDIVREM: {
 883     SDValue Op = SDValue(N, 0);
 884     LowerUDIVREM64(Op, DAG, Results);
 885     break;
 886   }
 887   }
 888 }
 889
 890 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 891                                                    SDValue Vector) const {
 892
 893   SDLoc DL(Vector);
 894   EVT VecVT = Vector.getValueType();
 895   EVT EltVT = VecVT.getVectorElementType();
 896   SmallVector<SDValue, 8> Args;
 897
 898   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 899                                                            i != e; ++i) {
 900     Args.push_back(DAG.getNode(
 901         ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
 902         DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
 903   }
 904
 905   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 906 }
 907
 908 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 909                                                     SelectionDAG &DAG) const {
 910
 911   SDLoc DL(Op);
 912   SDValue Vector = Op.getOperand(0);
 913   SDValue Index = Op.getOperand(1);
 914
 915   if (isa<ConstantSDNode>(Index) ||
 916       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 917     return Op;
 918
 919   Vector = vectorToVerticalVector(DAG, Vector);
 920   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 921                      Vector, Index);
 922 }
 923
 924 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 925                                                    SelectionDAG &DAG) const {
 926   SDLoc DL(Op);
 927   SDValue Vector = Op.getOperand(0);
 928   SDValue Value = Op.getOperand(1);
 929   SDValue Index = Op.getOperand(2);
 930
 931   if (isa<ConstantSDNode>(Index) ||
 932       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 933     return Op;
 934
 935   Vector = vectorToVerticalVector(DAG, Vector);
 936   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 937                                Vector, Value, Index);
 938   return vectorToVerticalVector(DAG, Insert);
 939 }
 940
 941 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 942   // On hw >= R700, COS/SIN input must be between -1. and 1.
 943   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 944   EVT VT = Op.getValueType();
 945   SDValue Arg = Op.getOperand(0);
 946   SDLoc DL(Op);
 947   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
 948       DAG.getNode(ISD::FADD, DL, VT,
 949         DAG.getNode(ISD::FMUL, DL, VT, Arg,
 950           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
 951         DAG.getConstantFP(0.5, DL, MVT::f32)));
 952   unsigned TrigNode;
 953   switch (Op.getOpcode()) {
 954   case ISD::FCOS:
 955     TrigNode = AMDGPUISD::COS_HW;
 956     break;
 957   case ISD::FSIN:
 958     TrigNode = AMDGPUISD::SIN_HW;
 959     break;
 960   default:
 961     llvm_unreachable("Wrong trig opcode");
 962   }
 963   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
 964       DAG.getNode(ISD::FADD, DL, VT, FractPart,
 965         DAG.getConstantFP(-0.5, DL, MVT::f32)));
 966   if (Gen >= AMDGPUSubtarget::R700)
 967     return TrigVal;
 968   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 969   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
 970       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
 971 }
 972
 973 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
 974   SDLoc DL(Op);
 975   EVT VT = Op.getValueType();
 976
 977   SDValue Lo = Op.getOperand(0);
 978   SDValue Hi = Op.getOperand(1);
 979   SDValue Shift = Op.getOperand(2);
 980   SDValue Zero = DAG.getConstant(0, DL, VT);
 981   SDValue One  = DAG.getConstant(1, DL, VT);
 982
 983   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
 984   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
 985   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
 986   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
 987
 988   // The dance around Width1 is necessary for 0 special case.
 989   // Without it the CompShift might be 32, producing incorrect results in
 990   // Overflow. So we do the shift in two steps, the alternative is to
 991   // add a conditional to filter the special case.
 992
 993   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
 994   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
 995
 996   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
 997   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
 998   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
 999
1000   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1001   SDValue LoBig = Zero;
1002
1003   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1004   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1005
1006   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1007 }
1008
1009 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1010   SDLoc DL(Op);
1011   EVT VT = Op.getValueType();
1012
1013   SDValue Lo = Op.getOperand(0);
1014   SDValue Hi = Op.getOperand(1);
1015   SDValue Shift = Op.getOperand(2);
1016   SDValue Zero = DAG.getConstant(0, DL, VT);
1017   SDValue One  = DAG.getConstant(1, DL, VT);
1018
1019   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1020
1021   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1022   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1023   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1024   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1025
1026   // The dance around Width1 is necessary for 0 special case.
1027   // Without it the CompShift might be 32, producing incorrect results in
1028   // Overflow. So we do the shift in two steps, the alternative is to
1029   // add a conditional to filter the special case.
1030
1031   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1032   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1033
1034   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1035   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1036   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1037
1038   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1039   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1040
1041   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1042   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1043
1044   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1045 }
1046
1047 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1048                                           unsigned mainop, unsigned ovf) const {
1049   SDLoc DL(Op);
1050   EVT VT = Op.getValueType();
1051
1052   SDValue Lo = Op.getOperand(0);
1053   SDValue Hi = Op.getOperand(1);
1054
1055   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1056   // Extend sign.
1057   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1058                     DAG.getValueType(MVT::i1));
1059
1060   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1061
1062   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1063 }
1064
1065 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1066   SDLoc DL(Op);
1067   return DAG.getNode(
1068       ISD::SETCC,
1069       DL,
1070       MVT::i1,
1071       Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
1072       DAG.getCondCode(ISD::SETNE)
1073       );
1074 }
1075
1076 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1077                                                    SDLoc DL,
1078                                                    unsigned DwordOffset) const {
1079   unsigned ByteOffset = DwordOffset * 4;
1080   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1081                                       AMDGPUAS::CONSTANT_BUFFER_0);
1082
1083   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1084   assert(isInt<16>(ByteOffset));
1085
1086   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1087                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1088                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1089                      false, false, false, 0);
1090 }
1091
1092 bool R600TargetLowering::isZero(SDValue Op) const {
1093   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1094     return Cst->isNullValue();
1095   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1096     return CstFP->isZero();
1097   } else {
1098     return false;
1099   }
1100 }
1101
/// Lower SELECT_CC (operands: LHS, RHS, True, False, CC).
///
/// The strategies below are tried in order; the first one that applies
/// produces the result:
///   1. For an f32 result, the legacy fmin/fmax combine.
///   2. A form matching a SET* instruction (True/False are the hardware
///      true/false constants).
///   3. A form matching a CND* instruction (comparison against zero).
///   4. Otherwise, two chained SELECT_CC nodes: the first materializes the
///      comparison as a hardware true/false value, the second selects
///      True/False by comparing that value against hardware false.
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  // Scratch value used to swap True and False in the CND* path below.
  SDValue Temp;

  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  // If the operands are (False = HW true, True = HW false), they are swapped
  // and the condition is inverted -- but only when the inverted condition, or
  // the inverted condition with LHS/RHS exchanged, is legal for CompareVT.
  // If neither is legal, nothing is changed here.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    // CC may have been replaced above, so re-read the condition from it
    // rather than reusing the outer CCOpcode.
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    // "Not equal" conditions are rewritten as their inverse with True and
    // False exchanged; all other conditions are passed through unchanged.
    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    // The select is built in CompareVT and bitcast back to the result type.
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  // Hardware true/false constants for the comparison type: 1.0f / 0.0f for
  // f32 and -1 / 0 for i32.
  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  }
  else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  // First: Cond = (LHS CC RHS) ? HWTrue : HWFalse.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  // Second: result = (Cond != HWFalse) ? True : False.
  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}
1243
1244 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1245 /// convert these pointers to a register index.  Each register holds
1246 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1247 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1248 /// for indirect addressing.
1249 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1250                                                unsigned StackWidth,
1251                                                SelectionDAG &DAG) const {
1252   unsigned SRLPad;
1253   switch(StackWidth) {
1254   case 1:
1255     SRLPad = 2;
1256     break;
1257   case 2:
1258     SRLPad = 3;
1259     break;
1260   case 4:
1261     SRLPad = 4;
1262     break;
1263   default: llvm_unreachable("Invalid stack width");
1264   }
1265
1266   SDLoc DL(Ptr);
1267   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1268                      DAG.getConstant(SRLPad, DL, MVT::i32));
1269 }
1270
1271 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1272                                          unsigned ElemIdx,
1273                                          unsigned &Channel,
1274                                          unsigned &PtrIncr) const {
1275   switch (StackWidth) {
1276   default:
1277   case 1:
1278     Channel = 0;
1279     if (ElemIdx > 0) {
1280       PtrIncr = 1;
1281     } else {
1282       PtrIncr = 0;
1283     }
1284     break;
1285   case 2:
1286     Channel = ElemIdx % 2;
1287     if (ElemIdx == 2) {
1288       PtrIncr = 1;
1289     } else {
1290       PtrIncr = 0;
1291     }
1292     break;
1293   case 4:
1294     Channel = ElemIdx;
1295     PtrIncr = 0;
1296     break;
1297   }
1298 }
1299
/// Lower a store node (operands: Chain, Value, Ptr).
///
/// In order:
///   - Defers to AMDGPUTargetLowering::LowerSTORE and returns its result if
///     it produced a node.
///   - Global address space, truncating store: emits a STORE_MSKOR memory
///     intrinsic node on the dword address.
///   - Global address space, value at least 32 bits wide and pointer not yet
///     a DWORDADDR: converts the byte address to a dword address and
///     re-emits the store.
///   - Any remaining store that is not to the private address space:
///     returns SDValue().
///   - Private address space: emits REGISTER_STORE nodes (one per vector
///     element, or a single one for scalars) on the register index computed
///     by stackPtrToRegIndex.
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  // Give the common AMDGPU lowering the first chance.
  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      // Mask covering the stored bits: one byte for i8, two bytes for i16.
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
      }
      // Dword address = byte address >> 2.
      // NOTE(review): the result type here is the value type VT while the
      // AND below uses Ptr.getValueType(); presumably both are i32 -- confirm.
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, DL, MVT::i32));
      // Byte position inside the dword = byte address & 3.
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, DL, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      // Bit shift inside the dword = ByteIndex << 3 (i.e. ByteIndex * 8).
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                   DAG.getConstant(3, DL, VT));
      // Both the value and the mask are moved to the target byte position.
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      // The v4i32 input carries the shifted value in element 0 and the mask
      // in element 3; elements 1 and 2 are zero.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));

      // Truncating stores were already handled by the branch above, so only
      // the indexed check can still trigger here.
      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  // Everything below only applies to the private address space.
  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // NOTE(review): this repeats the call made at the top of the function with
  // the same arguments; it looks redundant -- confirm before removing.
  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode()) {
    return Ret;
  }
  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  // Turn the byte address into a register index.
  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    // NOTE(review): the message says "load" although this is the store path.
    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    // One REGISTER_STORE per element. Ptr is updated in place, so the
    // per-element increments from getStackAddress accumulate across
    // iterations. Every store hangs off the incoming Chain.
    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, DL, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, DL, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, DL, MVT::i32));
    }
    // Join the element stores into a single chain.
     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
   } else {
    // i8 values are zero-extended to i32 before being stored.
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
    DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
  }

  return Chain;
}
1412
1413 // return (512 + (kc_bank << 12)
1414 static int
1415 ConstantAddressBlock(unsigned AddressSpace) {
1416   switch (AddressSpace) {
1417   case AMDGPUAS::CONSTANT_BUFFER_0:
1418     return 512;
1419   case AMDGPUAS::CONSTANT_BUFFER_1:
1420     return 512 + 4096;
1421   case AMDGPUAS::CONSTANT_BUFFER_2:
1422     return 512 + 4096 * 2;
1423   case AMDGPUAS::CONSTANT_BUFFER_3:
1424     return 512 + 4096 * 3;
1425   case AMDGPUAS::CONSTANT_BUFFER_4:
1426     return 512 + 4096 * 4;
1427   case AMDGPUAS::CONSTANT_BUFFER_5:
1428     return 512 + 4096 * 5;
1429   case AMDGPUAS::CONSTANT_BUFFER_6:
1430     return 512 + 4096 * 6;
1431   case AMDGPUAS::CONSTANT_BUFFER_7:
1432     return 512 + 4096 * 7;
1433   case AMDGPUAS::CONSTANT_BUFFER_8:
1434     return 512 + 4096 * 8;
1435   case AMDGPUAS::CONSTANT_BUFFER_9:
1436     return 512 + 4096 * 9;
1437   case AMDGPUAS::CONSTANT_BUFFER_10:
1438     return 512 + 4096 * 10;
1439   case AMDGPUAS::CONSTANT_BUFFER_11:
1440     return 512 + 4096 * 11;
1441   case AMDGPUAS::CONSTANT_BUFFER_12:
1442     return 512 + 4096 * 12;
1443   case AMDGPUAS::CONSTANT_BUFFER_13:
1444     return 512 + 4096 * 13;
1445   case AMDGPUAS::CONSTANT_BUFFER_14:
1446     return 512 + 4096 * 14;
1447   case AMDGPUAS::CONSTANT_BUFFER_15:
1448     return 512 + 4096 * 15;
1449   default:
1450     return -1;
1451   }
1452 }
1453
1454 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1455 {
1456   EVT VT = Op.getValueType();
1457   SDLoc DL(Op);
1458   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1459   SDValue Chain = Op.getOperand(0);
1460   SDValue Ptr = Op.getOperand(1);
1461   SDValue LoweredLoad;
1462
1463   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1464   if (Ret.getNode()) {
1465     SDValue Ops[2] = {
1466       Ret,
1467       Chain
1468     };
1469     return DAG.getMergeValues(Ops, DL);
1470   }
1471
1472   // Lower loads constant address space global variable loads
1473   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1474       isa<GlobalVariable>(GetUnderlyingObject(
1475           LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {
1476
1477     SDValue Ptr = DAG.getZExtOrTrunc(
1478         LoadNode->getBasePtr(), DL,
1479         getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
1480     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1481         DAG.getConstant(2, DL, MVT::i32));
1482     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1483                        LoadNode->getChain(), Ptr,
1484                        DAG.getTargetConstant(0, DL, MVT::i32),
1485                        Op.getOperand(2));
1486   }
1487
1488   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1489     SDValue MergedValues[2] = {
1490       ScalarizeVectorLoad(Op, DAG),
1491       Chain
1492     };
1493     return DAG.getMergeValues(MergedValues, DL);
1494   }
1495
1496   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1497   if (ConstantBlock > -1 &&
1498       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1499        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1500     SDValue Result;
1501     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1502         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1503         isa<ConstantSDNode>(Ptr)) {
1504       SDValue Slots[4];
1505       for (unsigned i = 0; i < 4; i++) {
1506         // We want Const position encoded with the following formula :
1507         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1508         // const_index is Ptr computed by llvm using an alignment of 16.
1509         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1510         // then div by 4 at the ISel step
1511         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1512             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1513         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1514       }
1515       EVT NewVT = MVT::v4i32;
1516       unsigned NumElements = 4;
1517       if (VT.isVector()) {
1518         NewVT = VT;
1519         NumElements = VT.getVectorNumElements();
1520       }
1521       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1522                            makeArrayRef(Slots, NumElements));
1523     } else {
1524       // non-constant ptr can't be folded, keeps it as a v4f32 load
1525       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1526           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1527                       DAG.getConstant(4, DL, MVT::i32)),
1528                       DAG.getConstant(LoadNode->getAddressSpace() -
1529                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1530           );
1531     }
1532
1533     if (!VT.isVector()) {
1534       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1535                            DAG.getConstant(0, DL, MVT::i32));
1536     }
1537
1538     SDValue MergedValues[2] = {
1539       Result,
1540       Chain
1541     };
1542     return DAG.getMergeValues(MergedValues, DL);
1543   }
1544
1545   // For most operations returning SDValue() will result in the node being
1546   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1547   // need to manually expand loads that may be legal in some address spaces and
1548   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1549   // compute shaders, since the data is sign extended when it is uploaded to the
1550   // buffer. However SEXT loads from other address spaces are not supported, so
1551   // we need to expand them here.
1552   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1553     EVT MemVT = LoadNode->getMemoryVT();
1554     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1555     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1556                                   LoadNode->getPointerInfo(), MemVT,
1557                                   LoadNode->isVolatile(),
1558                                   LoadNode->isNonTemporal(),
1559                                   LoadNode->isInvariant(),
1560                                   LoadNode->getAlignment());
1561     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1562                               DAG.getValueType(MemVT));
1563
1564     SDValue MergedValues[2] = { Res, Chain };
1565     return DAG.getMergeValues(MergedValues, DL);
1566   }
1567
1568   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1569     return SDValue();
1570   }
1571
1572   // Lowering for indirect addressing
1573   const MachineFunction &MF = DAG.getMachineFunction();
1574   const AMDGPUFrameLowering *TFL =
1575       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1576   unsigned StackWidth = TFL->getStackWidth(MF);
1577
1578   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1579
1580   if (VT.isVector()) {
1581     unsigned NumElemVT = VT.getVectorNumElements();
1582     EVT ElemVT = VT.getVectorElementType();
1583     SDValue Loads[4];
1584
1585     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1586                                       "vector width in load");
1587
1588     for (unsigned i = 0; i < NumElemVT; ++i) {
1589       unsigned Channel, PtrIncr;
1590       getStackAddress(StackWidth, i, Channel, PtrIncr);
1591       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1592                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1593       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1594                              Chain, Ptr,
1595                              DAG.getTargetConstant(Channel, DL, MVT::i32),
1596                              Op.getOperand(2));
1597     }
1598     for (unsigned i = NumElemVT; i < 4; ++i) {
1599       Loads[i] = DAG.getUNDEF(ElemVT);
1600     }
1601     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1602     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1603   } else {
1604     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1605                               Chain, Ptr,
1606                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1607                               Op.getOperand(2));
1608   }
1609
1610   SDValue Ops[2] = {
1611     LoweredLoad,
1612     Chain
1613   };
1614
1615   return DAG.getMergeValues(Ops, DL);
1616 }
1617
1618 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1619   SDValue Chain = Op.getOperand(0);
1620   SDValue Cond  = Op.getOperand(1);
1621   SDValue Jump  = Op.getOperand(2);
1622
1623   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1624                      Chain, Jump, Cond);
1625 }
1626
1627 /// XXX Only kernel functions are supported, so we can assume for now that
1628 /// every function is a kernel function, but in the future we should use
1629 /// separate calling conventions for kernel and non-kernel functions.
1630 SDValue R600TargetLowering::LowerFormalArguments(
1631                                       SDValue Chain,
1632                                       CallingConv::ID CallConv,
1633                                       bool isVarArg,
1634                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1635                                       SDLoc DL, SelectionDAG &DAG,
1636                                       SmallVectorImpl<SDValue> &InVals) const {
1637   SmallVector<CCValAssign, 16> ArgLocs;
1638   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1639                  *DAG.getContext());
1640   MachineFunction &MF = DAG.getMachineFunction();
1641   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1642
1643   SmallVector<ISD::InputArg, 8> LocalIns;
1644
1645   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1646
1647   AnalyzeFormalArguments(CCInfo, LocalIns);
1648
1649   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1650     CCValAssign &VA = ArgLocs[i];
1651     const ISD::InputArg &In = Ins[i];
1652     EVT VT = In.VT;
1653     EVT MemVT = VA.getLocVT();
1654     if (!VT.isVector() && MemVT.isVector()) {
1655       // Get load source type if scalarized.
1656       MemVT = MemVT.getVectorElementType();
1657     }
1658
1659     if (MFI->getShaderType() != ShaderType::COMPUTE) {
1660       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1661       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1662       InVals.push_back(Register);
1663       continue;
1664     }
1665
1666     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1667                                           AMDGPUAS::CONSTANT_BUFFER_0);
1668
1669     // i64 isn't a legal type, so the register type used ends up as i32, which
1670     // isn't expected here. It attempts to create this sextload, but it ends up
1671     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1672     // for <1 x i64>.
1673
1674     // The first 36 bytes of the input buffer contains information about
1675     // thread group and global sizes.
1676     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1677     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1678       // FIXME: This should really check the extload type, but the handling of
1679       // extload vector parameters seems to be broken.
1680
1681       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1682       Ext = ISD::SEXTLOAD;
1683     }
1684
1685     // Compute the offset from the value.
1686     // XXX - I think PartOffset should give you this, but it seems to give the
1687     // size of the register which isn't useful.
1688
1689     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1690     unsigned PartOffset = VA.getLocMemOffset();
1691     unsigned Offset = 36 + VA.getLocMemOffset();
1692
1693     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1694     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1695                               DAG.getConstant(Offset, DL, MVT::i32),
1696                               DAG.getUNDEF(MVT::i32),
1697                               PtrInfo,
1698                               MemVT, false, true, true, 4);
1699
1700     // 4 is the preferred alignment for the CONSTANT memory space.
1701     InVals.push_back(Arg);
1702     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1703   }
1704   return Chain;
1705 }
1706
1707 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1708                                            EVT VT) const {
1709    if (!VT.isVector())
1710      return MVT::i32;
1711    return VT.changeVectorElementTypeToInteger();
1712 }
1713
1714 static SDValue CompactSwizzlableVector(
1715   SelectionDAG &DAG, SDValue VectorEntry,
1716   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1717   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1718   assert(RemapSwizzle.empty());
1719   SDValue NewBldVec[4] = {
1720     VectorEntry.getOperand(0),
1721     VectorEntry.getOperand(1),
1722     VectorEntry.getOperand(2),
1723     VectorEntry.getOperand(3)
1724   };
1725
1726   for (unsigned i = 0; i < 4; i++) {
1727     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1728       // We mask write here to teach later passes that the ith element of this
1729       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1730       // break false dependencies and additionnaly make assembly easier to read.
1731       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1732     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1733       if (C->isZero()) {
1734         RemapSwizzle[i] = 4; // SEL_0
1735         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1736       } else if (C->isExactlyValue(1.0)) {
1737         RemapSwizzle[i] = 5; // SEL_1
1738         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1739       }
1740     }
1741
1742     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1743       continue;
1744     for (unsigned j = 0; j < i; j++) {
1745       if (NewBldVec[i] == NewBldVec[j]) {
1746         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1747         RemapSwizzle[i] = j;
1748         break;
1749       }
1750     }
1751   }
1752
1753   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1754                      VectorEntry.getValueType(), NewBldVec);
1755 }
1756
1757 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1758                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1759   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1760   assert(RemapSwizzle.empty());
1761   SDValue NewBldVec[4] = {
1762       VectorEntry.getOperand(0),
1763       VectorEntry.getOperand(1),
1764       VectorEntry.getOperand(2),
1765       VectorEntry.getOperand(3)
1766   };
1767   bool isUnmovable[4] = { false, false, false, false };
1768   for (unsigned i = 0; i < 4; i++) {
1769     RemapSwizzle[i] = i;
1770     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1771       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1772           ->getZExtValue();
1773       if (i == Idx)
1774         isUnmovable[Idx] = true;
1775     }
1776   }
1777
1778   for (unsigned i = 0; i < 4; i++) {
1779     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1780       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1781           ->getZExtValue();
1782       if (isUnmovable[Idx])
1783         continue;
1784       // Swap i and Idx
1785       std::swap(NewBldVec[Idx], NewBldVec[i]);
1786       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1787       break;
1788     }
1789   }
1790
1791   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1792                      VectorEntry.getValueType(), NewBldVec);
1793 }
1794
1795
1796 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1797                                             SDValue Swz[4], SelectionDAG &DAG,
1798                                             SDLoc DL) const {
1799   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1800   // Old -> New swizzle values
1801   DenseMap<unsigned, unsigned> SwizzleRemap;
1802
1803   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1804   for (unsigned i = 0; i < 4; i++) {
1805     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1806     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1807       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1808   }
1809
1810   SwizzleRemap.clear();
1811   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1812   for (unsigned i = 0; i < 4; i++) {
1813     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1814     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1815       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1816   }
1817
1818   return BuildVector;
1819 }
1820
1821
1822 //===----------------------------------------------------------------------===//
1823 // Custom DAG Optimizations
1824 //===----------------------------------------------------------------------===//
1825
1826 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1827                                               DAGCombinerInfo &DCI) const {
1828   SelectionDAG &DAG = DCI.DAG;
1829
1830   switch (N->getOpcode()) {
1831   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1832   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1833   case ISD::FP_ROUND: {
1834       SDValue Arg = N->getOperand(0);
1835       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1836         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1837                            Arg.getOperand(0));
1838       }
1839       break;
1840     }
1841
1842   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1843   // (i32 select_cc f32, f32, -1, 0 cc)
1844   //
1845   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1846   // this to one of the SET*_DX10 instructions.
1847   case ISD::FP_TO_SINT: {
1848     SDValue FNeg = N->getOperand(0);
1849     if (FNeg.getOpcode() != ISD::FNEG) {
1850       return SDValue();
1851     }
1852     SDValue SelectCC = FNeg.getOperand(0);
1853     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1854         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1855         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1856         !isHWTrueValue(SelectCC.getOperand(2)) ||
1857         !isHWFalseValue(SelectCC.getOperand(3))) {
1858       return SDValue();
1859     }
1860
1861     SDLoc dl(N);
1862     return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1863                            SelectCC.getOperand(0), // LHS
1864                            SelectCC.getOperand(1), // RHS
1865                            DAG.getConstant(-1, dl, MVT::i32), // True
1866                            DAG.getConstant(0, dl, MVT::i32),  // False
1867                            SelectCC.getOperand(4)); // CC
1868
1869     break;
1870   }
1871
1872   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1873   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1874   case ISD::INSERT_VECTOR_ELT: {
1875     SDValue InVec = N->getOperand(0);
1876     SDValue InVal = N->getOperand(1);
1877     SDValue EltNo = N->getOperand(2);
1878     SDLoc dl(N);
1879
1880     // If the inserted element is an UNDEF, just use the input vector.
1881     if (InVal.getOpcode() == ISD::UNDEF)
1882       return InVec;
1883
1884     EVT VT = InVec.getValueType();
1885
1886     // If we can't generate a legal BUILD_VECTOR, exit
1887     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1888       return SDValue();
1889
1890     // Check that we know which element is being inserted
1891     if (!isa<ConstantSDNode>(EltNo))
1892       return SDValue();
1893     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1894
1895     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1896     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1897     // vector elements.
1898     SmallVector<SDValue, 8> Ops;
1899     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1900       Ops.append(InVec.getNode()->op_begin(),
1901                  InVec.getNode()->op_end());
1902     } else if (InVec.getOpcode() == ISD::UNDEF) {
1903       unsigned NElts = VT.getVectorNumElements();
1904       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1905     } else {
1906       return SDValue();
1907     }
1908
1909     // Insert the element
1910     if (Elt < Ops.size()) {
1911       // All the operands of BUILD_VECTOR must have the same type;
1912       // we enforce that here.
1913       EVT OpVT = Ops[0].getValueType();
1914       if (InVal.getValueType() != OpVT)
1915         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1916           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1917           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1918       Ops[Elt] = InVal;
1919     }
1920
1921     // Return the new vector
1922     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1923   }
1924
1925   // Extract_vec (Build_vector) generated by custom lowering
1926   // also needs to be customly combined
1927   case ISD::EXTRACT_VECTOR_ELT: {
1928     SDValue Arg = N->getOperand(0);
1929     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1930       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1931         unsigned Element = Const->getZExtValue();
1932         return Arg->getOperand(Element);
1933       }
1934     }
1935     if (Arg.getOpcode() == ISD::BITCAST &&
1936         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1937       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1938         unsigned Element = Const->getZExtValue();
1939         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1940             Arg->getOperand(0).getOperand(Element));
1941       }
1942     }
1943   }
1944
1945   case ISD::SELECT_CC: {
1946     // Try common optimizations
1947     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1948     if (Ret.getNode())
1949       return Ret;
1950
1951     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1952     //      selectcc x, y, a, b, inv(cc)
1953     //
1954     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1955     //      selectcc x, y, a, b, cc
1956     SDValue LHS = N->getOperand(0);
1957     if (LHS.getOpcode() != ISD::SELECT_CC) {
1958       return SDValue();
1959     }
1960
1961     SDValue RHS = N->getOperand(1);
1962     SDValue True = N->getOperand(2);
1963     SDValue False = N->getOperand(3);
1964     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1965
1966     if (LHS.getOperand(2).getNode() != True.getNode() ||
1967         LHS.getOperand(3).getNode() != False.getNode() ||
1968         RHS.getNode() != False.getNode()) {
1969       return SDValue();
1970     }
1971
1972     switch (NCC) {
1973     default: return SDValue();
1974     case ISD::SETNE: return LHS;
1975     case ISD::SETEQ: {
1976       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1977       LHSCC = ISD::getSetCCInverse(LHSCC,
1978                                   LHS.getOperand(0).getValueType().isInteger());
1979       if (DCI.isBeforeLegalizeOps() ||
1980           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1981         return DAG.getSelectCC(SDLoc(N),
1982                                LHS.getOperand(0),
1983                                LHS.getOperand(1),
1984                                LHS.getOperand(2),
1985                                LHS.getOperand(3),
1986                                LHSCC);
1987       break;
1988     }
1989     }
1990     return SDValue();
1991   }
1992
1993   case AMDGPUISD::EXPORT: {
1994     SDValue Arg = N->getOperand(1);
1995     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1996       break;
1997
1998     SDValue NewArgs[8] = {
1999       N->getOperand(0), // Chain
2000       SDValue(),
2001       N->getOperand(2), // ArrayBase
2002       N->getOperand(3), // Type
2003       N->getOperand(4), // SWZ_X
2004       N->getOperand(5), // SWZ_Y
2005       N->getOperand(6), // SWZ_Z
2006       N->getOperand(7) // SWZ_W
2007     };
2008     SDLoc DL(N);
2009     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2010     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2011   }
2012   case AMDGPUISD::TEXTURE_FETCH: {
2013     SDValue Arg = N->getOperand(1);
2014     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2015       break;
2016
2017     SDValue NewArgs[19] = {
2018       N->getOperand(0),
2019       N->getOperand(1),
2020       N->getOperand(2),
2021       N->getOperand(3),
2022       N->getOperand(4),
2023       N->getOperand(5),
2024       N->getOperand(6),
2025       N->getOperand(7),
2026       N->getOperand(8),
2027       N->getOperand(9),
2028       N->getOperand(10),
2029       N->getOperand(11),
2030       N->getOperand(12),
2031       N->getOperand(13),
2032       N->getOperand(14),
2033       N->getOperand(15),
2034       N->getOperand(16),
2035       N->getOperand(17),
2036       N->getOperand(18),
2037     };
2038     SDLoc DL(N);
2039     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2040     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2041   }
2042   }
2043
2044   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2045 }
2046
2047 static bool
2048 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2049             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2050   const R600InstrInfo *TII =
2051       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2052   if (!Src.isMachineOpcode())
2053     return false;
2054   switch (Src.getMachineOpcode()) {
2055   case AMDGPU::FNEG_R600:
2056     if (!Neg.getNode())
2057       return false;
2058     Src = Src.getOperand(0);
2059     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2060     return true;
2061   case AMDGPU::FABS_R600:
2062     if (!Abs.getNode())
2063       return false;
2064     Src = Src.getOperand(0);
2065     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2066     return true;
2067   case AMDGPU::CONST_COPY: {
2068     unsigned Opcode = ParentNode->getMachineOpcode();
2069     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2070
2071     if (!Sel.getNode())
2072       return false;
2073
2074     SDValue CstOffset = Src.getOperand(0);
2075     if (ParentNode->getValueType(0).isVector())
2076       return false;
2077
2078     // Gather constants values
2079     int SrcIndices[] = {
2080       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2081       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2082       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2083       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2084       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2085       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2086       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2087       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2088       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2089       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2090       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2091     };
2092     std::vector<unsigned> Consts;
2093     for (int OtherSrcIdx : SrcIndices) {
2094       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2095       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2096         continue;
2097       if (HasDst) {
2098         OtherSrcIdx--;
2099         OtherSelIdx--;
2100       }
2101       if (RegisterSDNode *Reg =
2102           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2103         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2104           ConstantSDNode *Cst
2105             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2106           Consts.push_back(Cst->getZExtValue());
2107         }
2108       }
2109     }
2110
2111     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2112     Consts.push_back(Cst->getZExtValue());
2113     if (!TII->fitsConstReadLimitations(Consts)) {
2114       return false;
2115     }
2116
2117     Sel = CstOffset;
2118     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2119     return true;
2120   }
2121   case AMDGPU::MOV_IMM_I32:
2122   case AMDGPU::MOV_IMM_F32: {
2123     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2124     uint64_t ImmValue = 0;
2125
2126
2127     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2128       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2129       float FloatValue = FPC->getValueAPF().convertToFloat();
2130       if (FloatValue == 0.0) {
2131         ImmReg = AMDGPU::ZERO;
2132       } else if (FloatValue == 0.5) {
2133         ImmReg = AMDGPU::HALF;
2134       } else if (FloatValue == 1.0) {
2135         ImmReg = AMDGPU::ONE;
2136       } else {
2137         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2138       }
2139     } else {
2140       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2141       uint64_t Value = C->getZExtValue();
2142       if (Value == 0) {
2143         ImmReg = AMDGPU::ZERO;
2144       } else if (Value == 1) {
2145         ImmReg = AMDGPU::ONE_INT;
2146       } else {
2147         ImmValue = Value;
2148       }
2149     }
2150
2151     // Check that we aren't already using an immediate.
2152     // XXX: It's possible for an instruction to have more than one
2153     // immediate operand, but this is not supported yet.
2154     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2155       if (!Imm.getNode())
2156         return false;
2157       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2158       assert(C);
2159       if (C->getZExtValue())
2160         return false;
2161       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2162     }
2163     Src = DAG.getRegister(ImmReg, MVT::i32);
2164     return true;
2165   }
2166   default:
2167     return false;
2168   }
2169 }
2170
2171
2172 /// \brief Fold the instructions after selecting them
2173 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2174                                             SelectionDAG &DAG) const {
2175   const R600InstrInfo *TII =
2176       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2177   if (!Node->isMachineOpcode())
2178     return Node;
2179   unsigned Opcode = Node->getMachineOpcode();
2180   SDValue FakeOp;
2181
2182   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2183
2184   if (Opcode == AMDGPU::DOT_4) {
2185     int OperandIdx[] = {
2186       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2187       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2188       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2189       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2190       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2191       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2192       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2193       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2194         };
2195     int NegIdx[] = {
2196       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2197       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2198       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2199       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2200       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2201       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2202       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2203       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2204     };
2205     int AbsIdx[] = {
2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2211       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2212       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2213       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2214     };
2215     for (unsigned i = 0; i < 8; i++) {
2216       if (OperandIdx[i] < 0)
2217         return Node;
2218       SDValue &Src = Ops[OperandIdx[i] - 1];
2219       SDValue &Neg = Ops[NegIdx[i] - 1];
2220       SDValue &Abs = Ops[AbsIdx[i] - 1];
2221       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2222       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2223       if (HasDst)
2224         SelIdx--;
2225       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2226       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2227         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2228     }
2229   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2230     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2231       SDValue &Src = Ops[i];
2232       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2233         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2234     }
2235   } else if (Opcode == AMDGPU::CLAMP_R600) {
2236     SDValue Src = Node->getOperand(0);
2237     if (!Src.isMachineOpcode() ||
2238         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2239       return Node;
2240     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2241         AMDGPU::OpName::clamp);
2242     if (ClampIdx < 0)
2243       return Node;
2244     SDLoc DL(Node);
2245     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2246     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2247     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2248                               Node->getVTList(), Ops);
2249   } else {
2250     if (!TII->hasInstrModifiers(Opcode))
2251       return Node;
2252     int OperandIdx[] = {
2253       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2254       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2255       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2256     };
2257     int NegIdx[] = {
2258       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2259       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2260       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2261     };
2262     int AbsIdx[] = {
2263       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2264       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2265       -1
2266     };
2267     for (unsigned i = 0; i < 3; i++) {
2268       if (OperandIdx[i] < 0)
2269         return Node;
2270       SDValue &Src = Ops[OperandIdx[i] - 1];
2271       SDValue &Neg = Ops[NegIdx[i] - 1];
2272       SDValue FakeAbs;
2273       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2274       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2275       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2276       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2277       if (HasDst) {
2278         SelIdx--;
2279         ImmIdx--;
2280       }
2281       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2282       SDValue &Imm = Ops[ImmIdx];
2283       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2284         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2285     }
2286   }
2287
2288   return Node;
2289 }