lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/CallingConvLower.h"
  20 #include "llvm/CodeGen/MachineFrameInfo.h"
  21 #include "llvm/CodeGen/MachineInstrBuilder.h"
  22 #include "llvm/CodeGen/MachineRegisterInfo.h"
  23 #include "llvm/CodeGen/SelectionDAG.h"
  24 #include "llvm/IR/Argument.h"
  25 #include "llvm/IR/Function.h"
  26
  27 using namespace llvm;
  28
  29 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  30     AMDGPUTargetLowering(TM),
  31     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  32   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  33   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  34   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  35   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  36   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  37   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  38
  39   computeRegisterProperties();
  40
  41   // Set condition code actions
  42   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  43   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  44   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  45   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  46   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  54
  55   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  56   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  58   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  59
  60   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  61   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  62
  63   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  64   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  65
  66   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  67   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  68
  69   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  70
  71   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  72   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  73   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  74
  75   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  76   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  77
  78   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  79   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  80   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  81
  82   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  83   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  84   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  85   setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
  86   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  87   setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
  88
  89   // Legalize loads and stores to the private address space.
  90   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  91   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  92   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  93
  94   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  95   // spaces, so it is custom lowered to handle those where it isn't.
  96   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  97   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  98   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  99   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
 100   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
 101   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
 102
 103   setOperationAction(ISD::STORE, MVT::i8, Custom);
 104   setOperationAction(ISD::STORE, MVT::i32, Custom);
 105   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 106   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 107   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 108   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 109
 110   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 111   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 112   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 113
 114   setTargetDAGCombine(ISD::FP_ROUND);
 115   setTargetDAGCombine(ISD::FP_TO_SINT);
 116   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 117   setTargetDAGCombine(ISD::SELECT_CC);
 118   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 119
 120   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 121
 122   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 123   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 124   setSchedulingPreference(Sched::Source);
 125 }
 126
 127 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 128     MachineInstr * MI, MachineBasicBlock * BB) const {
 129   MachineFunction * MF = BB->getParent();
 130   MachineRegisterInfo &MRI = MF->getRegInfo();
 131   MachineBasicBlock::iterator I = *MI;
 132   const R600InstrInfo *TII =
 133     static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
 134
 135   switch (MI->getOpcode()) {
 136   default:
 137     // Replace LDS_*_RET instruction that don't have any uses with the
 138     // equivalent LDS_*_NORET instruction.
 139     if (TII->isLDSRetInstr(MI->getOpcode())) {
 140       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 141       assert(DstIdx != -1);
 142       MachineInstrBuilder NewMI;
 143       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
 144         return BB;
 145
 146       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 147                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 148       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 149         NewMI.addOperand(MI->getOperand(i));
 150       }
 151     } else {
 152       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 153     }
 154     break;
 155   case AMDGPU::CLAMP_R600: {
 156     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 157                                                    AMDGPU::MOV,
 158                                                    MI->getOperand(0).getReg(),
 159                                                    MI->getOperand(1).getReg());
 160     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 161     break;
 162   }
 163
 164   case AMDGPU::FABS_R600: {
 165     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 166                                                     AMDGPU::MOV,
 167                                                     MI->getOperand(0).getReg(),
 168                                                     MI->getOperand(1).getReg());
 169     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 170     break;
 171   }
 172
 173   case AMDGPU::FNEG_R600: {
 174     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 175                                                     AMDGPU::MOV,
 176                                                     MI->getOperand(0).getReg(),
 177                                                     MI->getOperand(1).getReg());
 178     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 179     break;
 180   }
 181
 182   case AMDGPU::MASK_WRITE: {
 183     unsigned maskedRegister = MI->getOperand(0).getReg();
 184     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 185     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 186     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 187     break;
 188   }
 189
 190   case AMDGPU::MOV_IMM_F32:
 191     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 192                      MI->getOperand(1).getFPImm()->getValueAPF()
 193                          .bitcastToAPInt().getZExtValue());
 194     break;
 195   case AMDGPU::MOV_IMM_I32:
 196     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 197                      MI->getOperand(1).getImm());
 198     break;
 199   case AMDGPU::CONST_COPY: {
 200     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 201         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 202     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 203         MI->getOperand(1).getImm());
 204     break;
 205   }
 206
 207   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 208   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 209   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 210     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 211
 212     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 213             .addOperand(MI->getOperand(0))
 214             .addOperand(MI->getOperand(1))
 215             .addImm(EOP); // Set End of program bit
 216     break;
 217   }
 218
 219   case AMDGPU::TXD: {
 220     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 221     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 222     MachineOperand &RID = MI->getOperand(4);
 223     MachineOperand &SID = MI->getOperand(5);
 224     unsigned TextureId = MI->getOperand(6).getImm();
 225     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 226     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 227
 228     switch (TextureId) {
 229     case 5: // Rect
 230       CTX = CTY = 0;
 231       break;
 232     case 6: // Shadow1D
 233       SrcW = SrcZ;
 234       break;
 235     case 7: // Shadow2D
 236       SrcW = SrcZ;
 237       break;
 238     case 8: // ShadowRect
 239       CTX = CTY = 0;
 240       SrcW = SrcZ;
 241       break;
 242     case 9: // 1DArray
 243       SrcZ = SrcY;
 244       CTZ = 0;
 245       break;
 246     case 10: // 2DArray
 247       CTZ = 0;
 248       break;
 249     case 11: // Shadow1DArray
 250       SrcZ = SrcY;
 251       CTZ = 0;
 252       break;
 253     case 12: // Shadow2DArray
 254       CTZ = 0;
 255       break;
 256     }
 257     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 258             .addOperand(MI->getOperand(3))
 259             .addImm(SrcX)
 260             .addImm(SrcY)
 261             .addImm(SrcZ)
 262             .addImm(SrcW)
 263             .addImm(0)
 264             .addImm(0)
 265             .addImm(0)
 266             .addImm(0)
 267             .addImm(1)
 268             .addImm(2)
 269             .addImm(3)
 270             .addOperand(RID)
 271             .addOperand(SID)
 272             .addImm(CTX)
 273             .addImm(CTY)
 274             .addImm(CTZ)
 275             .addImm(CTW);
 276     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 277             .addOperand(MI->getOperand(2))
 278             .addImm(SrcX)
 279             .addImm(SrcY)
 280             .addImm(SrcZ)
 281             .addImm(SrcW)
 282             .addImm(0)
 283             .addImm(0)
 284             .addImm(0)
 285             .addImm(0)
 286             .addImm(1)
 287             .addImm(2)
 288             .addImm(3)
 289             .addOperand(RID)
 290             .addOperand(SID)
 291             .addImm(CTX)
 292             .addImm(CTY)
 293             .addImm(CTZ)
 294             .addImm(CTW);
 295     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 296             .addOperand(MI->getOperand(0))
 297             .addOperand(MI->getOperand(1))
 298             .addImm(SrcX)
 299             .addImm(SrcY)
 300             .addImm(SrcZ)
 301             .addImm(SrcW)
 302             .addImm(0)
 303             .addImm(0)
 304             .addImm(0)
 305             .addImm(0)
 306             .addImm(1)
 307             .addImm(2)
 308             .addImm(3)
 309             .addOperand(RID)
 310             .addOperand(SID)
 311             .addImm(CTX)
 312             .addImm(CTY)
 313             .addImm(CTZ)
 314             .addImm(CTW)
 315             .addReg(T0, RegState::Implicit)
 316             .addReg(T1, RegState::Implicit);
 317     break;
 318   }
 319
 320   case AMDGPU::TXD_SHADOW: {
 321     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 322     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 323     MachineOperand &RID = MI->getOperand(4);
 324     MachineOperand &SID = MI->getOperand(5);
 325     unsigned TextureId = MI->getOperand(6).getImm();
 326     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 327     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 328
 329     switch (TextureId) {
 330     case 5: // Rect
 331       CTX = CTY = 0;
 332       break;
 333     case 6: // Shadow1D
 334       SrcW = SrcZ;
 335       break;
 336     case 7: // Shadow2D
 337       SrcW = SrcZ;
 338       break;
 339     case 8: // ShadowRect
 340       CTX = CTY = 0;
 341       SrcW = SrcZ;
 342       break;
 343     case 9: // 1DArray
 344       SrcZ = SrcY;
 345       CTZ = 0;
 346       break;
 347     case 10: // 2DArray
 348       CTZ = 0;
 349       break;
 350     case 11: // Shadow1DArray
 351       SrcZ = SrcY;
 352       CTZ = 0;
 353       break;
 354     case 12: // Shadow2DArray
 355       CTZ = 0;
 356       break;
 357     }
 358
 359     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 360             .addOperand(MI->getOperand(3))
 361             .addImm(SrcX)
 362             .addImm(SrcY)
 363             .addImm(SrcZ)
 364             .addImm(SrcW)
 365             .addImm(0)
 366             .addImm(0)
 367             .addImm(0)
 368             .addImm(0)
 369             .addImm(1)
 370             .addImm(2)
 371             .addImm(3)
 372             .addOperand(RID)
 373             .addOperand(SID)
 374             .addImm(CTX)
 375             .addImm(CTY)
 376             .addImm(CTZ)
 377             .addImm(CTW);
 378     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 379             .addOperand(MI->getOperand(2))
 380             .addImm(SrcX)
 381             .addImm(SrcY)
 382             .addImm(SrcZ)
 383             .addImm(SrcW)
 384             .addImm(0)
 385             .addImm(0)
 386             .addImm(0)
 387             .addImm(0)
 388             .addImm(1)
 389             .addImm(2)
 390             .addImm(3)
 391             .addOperand(RID)
 392             .addOperand(SID)
 393             .addImm(CTX)
 394             .addImm(CTY)
 395             .addImm(CTZ)
 396             .addImm(CTW);
 397     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 398             .addOperand(MI->getOperand(0))
 399             .addOperand(MI->getOperand(1))
 400             .addImm(SrcX)
 401             .addImm(SrcY)
 402             .addImm(SrcZ)
 403             .addImm(SrcW)
 404             .addImm(0)
 405             .addImm(0)
 406             .addImm(0)
 407             .addImm(0)
 408             .addImm(1)
 409             .addImm(2)
 410             .addImm(3)
 411             .addOperand(RID)
 412             .addOperand(SID)
 413             .addImm(CTX)
 414             .addImm(CTY)
 415             .addImm(CTZ)
 416             .addImm(CTW)
 417             .addReg(T0, RegState::Implicit)
 418             .addReg(T1, RegState::Implicit);
 419     break;
 420   }
 421
 422   case AMDGPU::BRANCH:
 423       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 424               .addOperand(MI->getOperand(0));
 425       break;
 426
 427   case AMDGPU::BRANCH_COND_f32: {
 428     MachineInstr *NewMI =
 429       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 430               AMDGPU::PREDICATE_BIT)
 431               .addOperand(MI->getOperand(1))
 432               .addImm(OPCODE_IS_NOT_ZERO)
 433               .addImm(0); // Flags
 434     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 435     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 436             .addOperand(MI->getOperand(0))
 437             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 438     break;
 439   }
 440
 441   case AMDGPU::BRANCH_COND_i32: {
 442     MachineInstr *NewMI =
 443       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 444             AMDGPU::PREDICATE_BIT)
 445             .addOperand(MI->getOperand(1))
 446             .addImm(OPCODE_IS_NOT_ZERO_INT)
 447             .addImm(0); // Flags
 448     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 449     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 450            .addOperand(MI->getOperand(0))
 451             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 452     break;
 453   }
 454
 455   case AMDGPU::EG_ExportSwz:
 456   case AMDGPU::R600_ExportSwz: {
 457     // Instruction is left unmodified if its not the last one of its type
 458     bool isLastInstructionOfItsType = true;
 459     unsigned InstExportType = MI->getOperand(1).getImm();
 460     for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
 461          EndBlock = BB->end(); NextExportInst != EndBlock;
 462          NextExportInst = llvm::next(NextExportInst)) {
 463       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 464           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 465         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 466             .getImm();
 467         if (CurrentInstExportType == InstExportType) {
 468           isLastInstructionOfItsType = false;
 469           break;
 470         }
 471       }
 472     }
 473     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 474     if (!EOP && !isLastInstructionOfItsType)
 475       return BB;
 476     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 477     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 478             .addOperand(MI->getOperand(0))
 479             .addOperand(MI->getOperand(1))
 480             .addOperand(MI->getOperand(2))
 481             .addOperand(MI->getOperand(3))
 482             .addOperand(MI->getOperand(4))
 483             .addOperand(MI->getOperand(5))
 484             .addOperand(MI->getOperand(6))
 485             .addImm(CfInst)
 486             .addImm(EOP);
 487     break;
 488   }
 489   case AMDGPU::RETURN: {
 490     // RETURN instructions must have the live-out registers as implicit uses,
 491     // otherwise they appear dead.
 492     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 493     MachineInstrBuilder MIB(*MF, MI);
 494     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 495       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 496     return BB;
 497   }
 498   }
 499
 500   MI->eraseFromParent();
 501   return BB;
 502 }
 503
 504 //===----------------------------------------------------------------------===//
 505 // Custom DAG Lowering Operations
 506 //===----------------------------------------------------------------------===//
 507
 508 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 509   MachineFunction &MF = DAG.getMachineFunction();
 510   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 511   switch (Op.getOpcode()) {
 512   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 513   case ISD::FCOS:
 514   case ISD::FSIN: return LowerTrig(Op, DAG);
 515   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 516   case ISD::STORE: return LowerSTORE(Op, DAG);
 517   case ISD::LOAD: return LowerLOAD(Op, DAG);
 518   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 519   case ISD::INTRINSIC_VOID: {
 520     SDValue Chain = Op.getOperand(0);
 521     unsigned IntrinsicID =
 522                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 523     switch (IntrinsicID) {
 524     case AMDGPUIntrinsic::AMDGPU_store_output: {
 525       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 526       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 527       MFI->LiveOuts.push_back(Reg);
 528       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 529     }
 530     case AMDGPUIntrinsic::R600_store_swizzle: {
 531       const SDValue Args[8] = {
 532         Chain,
 533         Op.getOperand(2), // Export Value
 534         Op.getOperand(3), // ArrayBase
 535         Op.getOperand(4), // Type
 536         DAG.getConstant(0, MVT::i32), // SWZ_X
 537         DAG.getConstant(1, MVT::i32), // SWZ_Y
 538         DAG.getConstant(2, MVT::i32), // SWZ_Z
 539         DAG.getConstant(3, MVT::i32) // SWZ_W
 540       };
 541       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
 542           Args, 8);
 543     }
 544
 545     // default for switch(IntrinsicID)
 546     default: break;
 547     }
 548     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 549     break;
 550   }
 551   case ISD::INTRINSIC_WO_CHAIN: {
 552     unsigned IntrinsicID =
 553                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 554     EVT VT = Op.getValueType();
 555     SDLoc DL(Op);
 556     switch(IntrinsicID) {
 557     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 558     case AMDGPUIntrinsic::R600_load_input: {
 559       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 560       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 561       MachineFunction &MF = DAG.getMachineFunction();
 562       MachineRegisterInfo &MRI = MF.getRegInfo();
 563       MRI.addLiveIn(Reg);
 564       return DAG.getCopyFromReg(DAG.getEntryNode(),
 565           SDLoc(DAG.getEntryNode()), Reg, VT);
 566     }
 567
 568     case AMDGPUIntrinsic::R600_interp_input: {
 569       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 570       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 571       MachineSDNode *interp;
 572       if (ijb < 0) {
 573         const MachineFunction &MF = DAG.getMachineFunction();
 574         const R600InstrInfo *TII =
 575           static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
 576         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 577             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 578         return DAG.getTargetExtractSubreg(
 579             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 580             DL, MVT::f32, SDValue(interp, 0));
 581       }
 582       MachineFunction &MF = DAG.getMachineFunction();
 583       MachineRegisterInfo &MRI = MF.getRegInfo();
 584       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 585       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 586       MRI.addLiveIn(RegisterI);
 587       MRI.addLiveIn(RegisterJ);
 588       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 589           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 590       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 591           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 592
 593       if (slot % 4 < 2)
 594         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 595             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 596             RegisterJNode, RegisterINode);
 597       else
 598         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 599             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 600             RegisterJNode, RegisterINode);
 601       return SDValue(interp, slot % 2);
 602     }
 603     case AMDGPUIntrinsic::R600_interp_xy:
 604     case AMDGPUIntrinsic::R600_interp_zw: {
 605       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 606       MachineSDNode *interp;
 607       SDValue RegisterINode = Op.getOperand(2);
 608       SDValue RegisterJNode = Op.getOperand(3);
 609
 610       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 611         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 612             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 613             RegisterJNode, RegisterINode);
 614       else
 615         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 616             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 617             RegisterJNode, RegisterINode);
 618       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 619           SDValue(interp, 0), SDValue(interp, 1));
 620     }
 621     case AMDGPUIntrinsic::R600_tex:
 622     case AMDGPUIntrinsic::R600_texc:
 623     case AMDGPUIntrinsic::R600_txl:
 624     case AMDGPUIntrinsic::R600_txlc:
 625     case AMDGPUIntrinsic::R600_txb:
 626     case AMDGPUIntrinsic::R600_txbc:
 627     case AMDGPUIntrinsic::R600_txf:
 628     case AMDGPUIntrinsic::R600_txq:
 629     case AMDGPUIntrinsic::R600_ddx:
 630     case AMDGPUIntrinsic::R600_ddy:
 631     case AMDGPUIntrinsic::R600_ldptr: {
 632       unsigned TextureOp;
 633       switch (IntrinsicID) {
 634       case AMDGPUIntrinsic::R600_tex:
 635         TextureOp = 0;
 636         break;
 637       case AMDGPUIntrinsic::R600_texc:
 638         TextureOp = 1;
 639         break;
 640       case AMDGPUIntrinsic::R600_txl:
 641         TextureOp = 2;
 642         break;
 643       case AMDGPUIntrinsic::R600_txlc:
 644         TextureOp = 3;
 645         break;
 646       case AMDGPUIntrinsic::R600_txb:
 647         TextureOp = 4;
 648         break;
 649       case AMDGPUIntrinsic::R600_txbc:
 650         TextureOp = 5;
 651         break;
 652       case AMDGPUIntrinsic::R600_txf:
 653         TextureOp = 6;
 654         break;
 655       case AMDGPUIntrinsic::R600_txq:
 656         TextureOp = 7;
 657         break;
 658       case AMDGPUIntrinsic::R600_ddx:
 659         TextureOp = 8;
 660         break;
 661       case AMDGPUIntrinsic::R600_ddy:
 662         TextureOp = 9;
 663         break;
 664       case AMDGPUIntrinsic::R600_ldptr:
 665         TextureOp = 10;
 666         break;
 667       default:
 668         llvm_unreachable("Unknow Texture Operation");
 669       }
 670
 671       SDValue TexArgs[19] = {
 672         DAG.getConstant(TextureOp, MVT::i32),
 673         Op.getOperand(1),
 674         DAG.getConstant(0, MVT::i32),
 675         DAG.getConstant(1, MVT::i32),
 676         DAG.getConstant(2, MVT::i32),
 677         DAG.getConstant(3, MVT::i32),
 678         Op.getOperand(2),
 679         Op.getOperand(3),
 680         Op.getOperand(4),
 681         DAG.getConstant(0, MVT::i32),
 682         DAG.getConstant(1, MVT::i32),
 683         DAG.getConstant(2, MVT::i32),
 684         DAG.getConstant(3, MVT::i32),
 685         Op.getOperand(5),
 686         Op.getOperand(6),
 687         Op.getOperand(7),
 688         Op.getOperand(8),
 689         Op.getOperand(9),
 690         Op.getOperand(10)
 691       };
 692       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
 693     }
 694     case AMDGPUIntrinsic::AMDGPU_dp4: {
 695       SDValue Args[8] = {
 696       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 697           DAG.getConstant(0, MVT::i32)),
 698       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 699           DAG.getConstant(0, MVT::i32)),
 700       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 701           DAG.getConstant(1, MVT::i32)),
 702       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 703           DAG.getConstant(1, MVT::i32)),
 704       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 705           DAG.getConstant(2, MVT::i32)),
 706       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 707           DAG.getConstant(2, MVT::i32)),
 708       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 709           DAG.getConstant(3, MVT::i32)),
 710       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 711           DAG.getConstant(3, MVT::i32))
 712       };
 713       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
 714     }
 715
 716     case Intrinsic::r600_read_ngroups_x:
 717       return LowerImplicitParameter(DAG, VT, DL, 0);
 718     case Intrinsic::r600_read_ngroups_y:
 719       return LowerImplicitParameter(DAG, VT, DL, 1);
 720     case Intrinsic::r600_read_ngroups_z:
 721       return LowerImplicitParameter(DAG, VT, DL, 2);
 722     case Intrinsic::r600_read_global_size_x:
 723       return LowerImplicitParameter(DAG, VT, DL, 3);
 724     case Intrinsic::r600_read_global_size_y:
 725       return LowerImplicitParameter(DAG, VT, DL, 4);
 726     case Intrinsic::r600_read_global_size_z:
 727       return LowerImplicitParameter(DAG, VT, DL, 5);
 728     case Intrinsic::r600_read_local_size_x:
 729       return LowerImplicitParameter(DAG, VT, DL, 6);
 730     case Intrinsic::r600_read_local_size_y:
 731       return LowerImplicitParameter(DAG, VT, DL, 7);
 732     case Intrinsic::r600_read_local_size_z:
 733       return LowerImplicitParameter(DAG, VT, DL, 8);
 734
 735     case Intrinsic::r600_read_tgid_x:
 736       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 737                                   AMDGPU::T1_X, VT);
 738     case Intrinsic::r600_read_tgid_y:
 739       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 740                                   AMDGPU::T1_Y, VT);
 741     case Intrinsic::r600_read_tgid_z:
 742       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 743                                   AMDGPU::T1_Z, VT);
 744     case Intrinsic::r600_read_tidig_x:
 745       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 746                                   AMDGPU::T0_X, VT);
 747     case Intrinsic::r600_read_tidig_y:
 748       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 749                                   AMDGPU::T0_Y, VT);
 750     case Intrinsic::r600_read_tidig_z:
 751       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 752                                   AMDGPU::T0_Z, VT);
 753     }
 754     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 755     break;
 756   }
 757   } // end switch(Op.getOpcode())
 758   return SDValue();
 759 }
 760
 761 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 762                                             SmallVectorImpl<SDValue> &Results,
 763                                             SelectionDAG &DAG) const {
 764   switch (N->getOpcode()) {
 765   default: return;
 766   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 767     return;
 768   case ISD::LOAD: {
 769     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 770     Results.push_back(SDValue(Node, 0));
 771     Results.push_back(SDValue(Node, 1));
 772     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 773     // function
 774     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 775     return;
 776   }
 777   case ISD::STORE:
 778     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 779     Results.push_back(SDValue(Node, 0));
 780     return;
 781   }
 782 }
 783
 784 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 785   // On hw >= R700, COS/SIN input must be between -1. and 1.
 786   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 787   EVT VT = Op.getValueType();
 788   SDValue Arg = Op.getOperand(0);
 789   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 790       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 791         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 792           DAG.getConstantFP(0.15915494309, MVT::f32)),
 793         DAG.getConstantFP(0.5, MVT::f32)));
 794   unsigned TrigNode;
 795   switch (Op.getOpcode()) {
 796   case ISD::FCOS:
 797     TrigNode = AMDGPUISD::COS_HW;
 798     break;
 799   case ISD::FSIN:
 800     TrigNode = AMDGPUISD::SIN_HW;
 801     break;
 802   default:
 803     llvm_unreachable("Wrong trig opcode");
 804   }
 805   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
 806       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
 807         DAG.getConstantFP(-0.5, MVT::f32)));
 808   if (Gen >= AMDGPUSubtarget::R700)
 809     return TrigVal;
 810   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 811   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
 812       DAG.getConstantFP(3.14159265359, MVT::f32));
 813 }
 814
 815 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 816   return DAG.getNode(
 817       ISD::SETCC,
 818       SDLoc(Op),
 819       MVT::i1,
 820       Op, DAG.getConstantFP(0.0f, MVT::f32),
 821       DAG.getCondCode(ISD::SETNE)
 822       );
 823 }
 824
 825 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 826                                                    SDLoc DL,
 827                                                    unsigned DwordOffset) const {
 828   unsigned ByteOffset = DwordOffset * 4;
 829   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 830                                       AMDGPUAS::CONSTANT_BUFFER_0);
 831
 832   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 833   assert(isInt<16>(ByteOffset));
 834
 835   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 836                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 837                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 838                      false, false, false, 0);
 839 }
 840
 841 bool R600TargetLowering::isZero(SDValue Op) const {
 842   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 843     return Cst->isNullValue();
 844   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 845     return CstFP->isZero();
 846   } else {
 847     return false;
 848   }
 849 }
 850
 851 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 852   SDLoc DL(Op);
 853   EVT VT = Op.getValueType();
 854
 855   SDValue LHS = Op.getOperand(0);
 856   SDValue RHS = Op.getOperand(1);
 857   SDValue True = Op.getOperand(2);
 858   SDValue False = Op.getOperand(3);
 859   SDValue CC = Op.getOperand(4);
 860   SDValue Temp;
 861
 862   // LHS and RHS are guaranteed to be the same value type
 863   EVT CompareVT = LHS.getValueType();
 864
 865   // Check if we can lower this to a native operation.
 866
 867   // Try to lower to a SET* instruction:
 868   //
 869   // SET* can match the following patterns:
 870   //
 871   // select_cc f32, f32, -1,  0, cc_supported
 872   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
 873   // select_cc i32, i32, -1,  0, cc_supported
 874   //
 875
 876   // Move hardware True/False values to the correct operand.
 877   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 878   ISD::CondCode InverseCC =
 879      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 880   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 881     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
 882       std::swap(False, True);
 883       CC = DAG.getCondCode(InverseCC);
 884     } else {
 885       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
 886       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
 887         std::swap(False, True);
 888         std::swap(LHS, RHS);
 889         CC = DAG.getCondCode(SwapInvCC);
 890       }
 891     }
 892   }
 893
 894   if (isHWTrueValue(True) && isHWFalseValue(False) &&
 895       (CompareVT == VT || VT == MVT::i32)) {
 896     // This can be matched by a SET* instruction.
 897     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 898   }
 899
 900   // Try to lower to a CND* instruction:
 901   //
 902   // CND* can match the following patterns:
 903   //
 904   // select_cc f32, 0.0, f32, f32, cc_supported
 905   // select_cc f32, 0.0, i32, i32, cc_supported
 906   // select_cc i32, 0,   f32, f32, cc_supported
 907   // select_cc i32, 0,   i32, i32, cc_supported
 908   //
 909
 910   // Try to move the zero value to the RHS
 911   if (isZero(LHS)) {
 912     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 913     // Try swapping the operands
 914     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
 915     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
 916       std::swap(LHS, RHS);
 917       CC = DAG.getCondCode(CCSwapped);
 918     } else {
 919       // Try inverting the conditon and then swapping the operands
 920       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
 921       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
 922       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
 923         std::swap(True, False);
 924         std::swap(LHS, RHS);
 925         CC = DAG.getCondCode(CCSwapped);
 926       }
 927     }
 928   }
 929   if (isZero(RHS)) {
 930     SDValue Cond = LHS;
 931     SDValue Zero = RHS;
 932     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 933     if (CompareVT != VT) {
 934       // Bitcast True / False to the correct types.  This will end up being
 935       // a nop, but it allows us to define only a single pattern in the
 936       // .TD files for each CND* instruction rather than having to have
 937       // one pattern for integer True/False and one for fp True/False
 938       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 939       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 940     }
 941
 942     switch (CCOpcode) {
 943     case ISD::SETONE:
 944     case ISD::SETUNE:
 945     case ISD::SETNE:
 946       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 947       Temp = True;
 948       True = False;
 949       False = Temp;
 950       break;
 951     default:
 952       break;
 953     }
 954     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 955         Cond, Zero,
 956         True, False,
 957         DAG.getCondCode(CCOpcode));
 958     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 959   }
 960
 961
 962   // Possible Min/Max pattern
 963   SDValue MinMax = LowerMinMax(Op, DAG);
 964   if (MinMax.getNode()) {
 965     return MinMax;
 966   }
 967
 968   // If we make it this for it means we have no native instructions to handle
 969   // this SELECT_CC, so we must lower it.
 970   SDValue HWTrue, HWFalse;
 971
 972   if (CompareVT == MVT::f32) {
 973     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 974     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 975   } else if (CompareVT == MVT::i32) {
 976     HWTrue = DAG.getConstant(-1, CompareVT);
 977     HWFalse = DAG.getConstant(0, CompareVT);
 978   }
 979   else {
 980     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
 981   }
 982
 983   // Lower this unsupported SELECT_CC into a combination of two supported
 984   // SELECT_CC operations.
 985   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 986
 987   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 988       Cond, HWFalse,
 989       True, False,
 990       DAG.getCondCode(ISD::SETNE));
 991 }
 992
 993 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 994 /// convert these pointers to a register index.  Each register holds
 995 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
 996 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
 997 /// for indirect addressing.
 998 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
 999                                                unsigned StackWidth,
1000                                                SelectionDAG &DAG) const {
1001   unsigned SRLPad;
1002   switch(StackWidth) {
1003   case 1:
1004     SRLPad = 2;
1005     break;
1006   case 2:
1007     SRLPad = 3;
1008     break;
1009   case 4:
1010     SRLPad = 4;
1011     break;
1012   default: llvm_unreachable("Invalid stack width");
1013   }
1014
1015   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1016                      DAG.getConstant(SRLPad, MVT::i32));
1017 }
1018
1019 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1020                                          unsigned ElemIdx,
1021                                          unsigned &Channel,
1022                                          unsigned &PtrIncr) const {
1023   switch (StackWidth) {
1024   default:
1025   case 1:
1026     Channel = 0;
1027     if (ElemIdx > 0) {
1028       PtrIncr = 1;
1029     } else {
1030       PtrIncr = 0;
1031     }
1032     break;
1033   case 2:
1034     Channel = ElemIdx % 2;
1035     if (ElemIdx == 2) {
1036       PtrIncr = 1;
1037     } else {
1038       PtrIncr = 0;
1039     }
1040     break;
1041   case 4:
1042     Channel = ElemIdx;
1043     PtrIncr = 0;
1044     break;
1045   }
1046 }
1047
1048 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1049   SDLoc DL(Op);
1050   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1051   SDValue Chain = Op.getOperand(0);
1052   SDValue Value = Op.getOperand(1);
1053   SDValue Ptr = Op.getOperand(2);
1054
1055   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1056   if (Result.getNode()) {
1057     return Result;
1058   }
1059
1060   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1061     if (StoreNode->isTruncatingStore()) {
1062       EVT VT = Value.getValueType();
1063       assert(VT.bitsLE(MVT::i32));
1064       EVT MemVT = StoreNode->getMemoryVT();
1065       SDValue MaskConstant;
1066       if (MemVT == MVT::i8) {
1067         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1068       } else {
1069         assert(MemVT == MVT::i16);
1070         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1071       }
1072       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1073                                       DAG.getConstant(2, MVT::i32));
1074       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1075                                       DAG.getConstant(0x00000003, VT));
1076       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1077       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1078                                    DAG.getConstant(3, VT));
1079       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1080       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1081       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1082       // vector instead.
1083       SDValue Src[4] = {
1084         ShiftedValue,
1085         DAG.getConstant(0, MVT::i32),
1086         DAG.getConstant(0, MVT::i32),
1087         Mask
1088       };
1089       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
1090       SDValue Args[3] = { Chain, Input, DWordAddr };
1091       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1092                                      Op->getVTList(), Args, 3, MemVT,
1093                                      StoreNode->getMemOperand());
1094     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1095                Value.getValueType().bitsGE(MVT::i32)) {
1096       // Convert pointer from byte address to dword address.
1097       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1098                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1099                                     Ptr, DAG.getConstant(2, MVT::i32)));
1100
1101       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1102         llvm_unreachable("Truncated and indexed stores not supported yet");
1103       } else {
1104         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1105       }
1106       return Chain;
1107     }
1108   }
1109
1110   EVT ValueVT = Value.getValueType();
1111
1112   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1113     return SDValue();
1114   }
1115
1116   // Lowering for indirect addressing
1117
1118   const MachineFunction &MF = DAG.getMachineFunction();
1119   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1120                                          getTargetMachine().getFrameLowering());
1121   unsigned StackWidth = TFL->getStackWidth(MF);
1122
1123   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1124
1125   if (ValueVT.isVector()) {
1126     unsigned NumElemVT = ValueVT.getVectorNumElements();
1127     EVT ElemVT = ValueVT.getVectorElementType();
1128     SDValue Stores[4];
1129
1130     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1131                                       "vector width in load");
1132
1133     for (unsigned i = 0; i < NumElemVT; ++i) {
1134       unsigned Channel, PtrIncr;
1135       getStackAddress(StackWidth, i, Channel, PtrIncr);
1136       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1137                         DAG.getConstant(PtrIncr, MVT::i32));
1138       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1139                                  Value, DAG.getConstant(i, MVT::i32));
1140
1141       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1142                               Chain, Elem, Ptr,
1143                               DAG.getTargetConstant(Channel, MVT::i32));
1144     }
1145      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1146    } else {
1147     if (ValueVT == MVT::i8) {
1148       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1149     }
1150     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1151     DAG.getTargetConstant(0, MVT::i32)); // Channel
1152   }
1153
1154   return Chain;
1155 }
1156
1157 // return (512 + (kc_bank << 12)
1158 static int
1159 ConstantAddressBlock(unsigned AddressSpace) {
1160   switch (AddressSpace) {
1161   case AMDGPUAS::CONSTANT_BUFFER_0:
1162     return 512;
1163   case AMDGPUAS::CONSTANT_BUFFER_1:
1164     return 512 + 4096;
1165   case AMDGPUAS::CONSTANT_BUFFER_2:
1166     return 512 + 4096 * 2;
1167   case AMDGPUAS::CONSTANT_BUFFER_3:
1168     return 512 + 4096 * 3;
1169   case AMDGPUAS::CONSTANT_BUFFER_4:
1170     return 512 + 4096 * 4;
1171   case AMDGPUAS::CONSTANT_BUFFER_5:
1172     return 512 + 4096 * 5;
1173   case AMDGPUAS::CONSTANT_BUFFER_6:
1174     return 512 + 4096 * 6;
1175   case AMDGPUAS::CONSTANT_BUFFER_7:
1176     return 512 + 4096 * 7;
1177   case AMDGPUAS::CONSTANT_BUFFER_8:
1178     return 512 + 4096 * 8;
1179   case AMDGPUAS::CONSTANT_BUFFER_9:
1180     return 512 + 4096 * 9;
1181   case AMDGPUAS::CONSTANT_BUFFER_10:
1182     return 512 + 4096 * 10;
1183   case AMDGPUAS::CONSTANT_BUFFER_11:
1184     return 512 + 4096 * 11;
1185   case AMDGPUAS::CONSTANT_BUFFER_12:
1186     return 512 + 4096 * 12;
1187   case AMDGPUAS::CONSTANT_BUFFER_13:
1188     return 512 + 4096 * 13;
1189   case AMDGPUAS::CONSTANT_BUFFER_14:
1190     return 512 + 4096 * 14;
1191   case AMDGPUAS::CONSTANT_BUFFER_15:
1192     return 512 + 4096 * 15;
1193   default:
1194     return -1;
1195   }
1196 }
1197
1198 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1199 {
1200   EVT VT = Op.getValueType();
1201   SDLoc DL(Op);
1202   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1203   SDValue Chain = Op.getOperand(0);
1204   SDValue Ptr = Op.getOperand(1);
1205   SDValue LoweredLoad;
1206
1207   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1208     SDValue MergedValues[2] = {
1209       SplitVectorLoad(Op, DAG),
1210       Chain
1211     };
1212     return DAG.getMergeValues(MergedValues, 2, DL);
1213   }
1214
1215   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1216   if (ConstantBlock > -1 &&
1217       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1218        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1219     SDValue Result;
1220     if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
1221         isa<Constant>(LoadNode->getSrcValue()) ||
1222         isa<ConstantSDNode>(Ptr)) {
1223       SDValue Slots[4];
1224       for (unsigned i = 0; i < 4; i++) {
1225         // We want Const position encoded with the following formula :
1226         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1227         // const_index is Ptr computed by llvm using an alignment of 16.
1228         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1229         // then div by 4 at the ISel step
1230         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1231             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1232         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1233       }
1234       EVT NewVT = MVT::v4i32;
1235       unsigned NumElements = 4;
1236       if (VT.isVector()) {
1237         NewVT = VT;
1238         NumElements = VT.getVectorNumElements();
1239       }
1240       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
1241     } else {
1242       // non-constant ptr can't be folded, keeps it as a v4f32 load
1243       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1244           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1245           DAG.getConstant(LoadNode->getAddressSpace() -
1246                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1247           );
1248     }
1249
1250     if (!VT.isVector()) {
1251       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1252           DAG.getConstant(0, MVT::i32));
1253     }
1254
1255     SDValue MergedValues[2] = {
1256         Result,
1257         Chain
1258     };
1259     return DAG.getMergeValues(MergedValues, 2, DL);
1260   }
1261
1262   // For most operations returning SDValue() will result in the node being
1263   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1264   // need to manually expand loads that may be legal in some address spaces and
1265   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1266   // compute shaders, since the data is sign extended when it is uploaded to the
1267   // buffer. However SEXT loads from other address spaces are not supported, so
1268   // we need to expand them here.
1269   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1270     EVT MemVT = LoadNode->getMemoryVT();
1271     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1272     SDValue ShiftAmount =
1273           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1274     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1275                                   LoadNode->getPointerInfo(), MemVT,
1276                                   LoadNode->isVolatile(),
1277                                   LoadNode->isNonTemporal(),
1278                                   LoadNode->getAlignment());
1279     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1280     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1281
1282     SDValue MergedValues[2] = { Sra, Chain };
1283     return DAG.getMergeValues(MergedValues, 2, DL);
1284   }
1285
1286   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1287     return SDValue();
1288   }
1289
1290   // Lowering for indirect addressing
1291   const MachineFunction &MF = DAG.getMachineFunction();
1292   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1293                                          getTargetMachine().getFrameLowering());
1294   unsigned StackWidth = TFL->getStackWidth(MF);
1295
1296   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1297
1298   if (VT.isVector()) {
1299     unsigned NumElemVT = VT.getVectorNumElements();
1300     EVT ElemVT = VT.getVectorElementType();
1301     SDValue Loads[4];
1302
1303     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1304                                       "vector width in load");
1305
1306     for (unsigned i = 0; i < NumElemVT; ++i) {
1307       unsigned Channel, PtrIncr;
1308       getStackAddress(StackWidth, i, Channel, PtrIncr);
1309       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1310                         DAG.getConstant(PtrIncr, MVT::i32));
1311       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1312                              Chain, Ptr,
1313                              DAG.getTargetConstant(Channel, MVT::i32),
1314                              Op.getOperand(2));
1315     }
1316     for (unsigned i = NumElemVT; i < 4; ++i) {
1317       Loads[i] = DAG.getUNDEF(ElemVT);
1318     }
1319     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1320     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1321   } else {
1322     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1323                               Chain, Ptr,
1324                               DAG.getTargetConstant(0, MVT::i32), // Channel
1325                               Op.getOperand(2));
1326   }
1327
1328   SDValue Ops[2];
1329   Ops[0] = LoweredLoad;
1330   Ops[1] = Chain;
1331
1332   return DAG.getMergeValues(Ops, 2, DL);
1333 }
1334
1335 /// XXX Only kernel functions are supported, so we can assume for now that
1336 /// every function is a kernel function, but in the future we should use
1337 /// separate calling conventions for kernel and non-kernel functions.
1338 SDValue R600TargetLowering::LowerFormalArguments(
1339                                       SDValue Chain,
1340                                       CallingConv::ID CallConv,
1341                                       bool isVarArg,
1342                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1343                                       SDLoc DL, SelectionDAG &DAG,
1344                                       SmallVectorImpl<SDValue> &InVals) const {
1345   SmallVector<CCValAssign, 16> ArgLocs;
1346   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1347                  getTargetMachine(), ArgLocs, *DAG.getContext());
1348   MachineFunction &MF = DAG.getMachineFunction();
1349   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
1350
1351   SmallVector<ISD::InputArg, 8> LocalIns;
1352
1353   getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
1354                           LocalIns);
1355
1356   AnalyzeFormalArguments(CCInfo, LocalIns);
1357
1358   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1359     CCValAssign &VA = ArgLocs[i];
1360     EVT VT = Ins[i].VT;
1361     EVT MemVT = LocalIns[i].VT;
1362
1363     if (ShaderType != ShaderType::COMPUTE) {
1364       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1365       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1366       InVals.push_back(Register);
1367       continue;
1368     }
1369
1370     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1371                                                    AMDGPUAS::CONSTANT_BUFFER_0);
1372
1373     // The first 36 bytes of the input buffer contains information about
1374     // thread group and global sizes.
1375     SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
1376                                  DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1377                                  MachinePointerInfo(UndefValue::get(PtrTy)),
1378                                  MemVT, false, false, 4);
1379                                  // 4 is the prefered alignment for
1380                                  // the CONSTANT memory space.
1381     InVals.push_back(Arg);
1382   }
1383   return Chain;
1384 }
1385
1386 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1387    if (!VT.isVector()) return MVT::i32;
1388    return VT.changeVectorElementTypeToInteger();
1389 }
1390
1391 static SDValue
1392 CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
1393                         DenseMap<unsigned, unsigned> &RemapSwizzle) {
1394   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1395   assert(RemapSwizzle.empty());
1396   SDValue NewBldVec[4] = {
1397       VectorEntry.getOperand(0),
1398       VectorEntry.getOperand(1),
1399       VectorEntry.getOperand(2),
1400       VectorEntry.getOperand(3)
1401   };
1402
1403   for (unsigned i = 0; i < 4; i++) {
1404     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1405       // We mask write here to teach later passes that the ith element of this
1406       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1407       // break false dependencies and additionnaly make assembly easier to read.
1408       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1409     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1410       if (C->isZero()) {
1411         RemapSwizzle[i] = 4; // SEL_0
1412         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1413       } else if (C->isExactlyValue(1.0)) {
1414         RemapSwizzle[i] = 5; // SEL_1
1415         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1416       }
1417     }
1418
1419     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1420       continue;
1421     for (unsigned j = 0; j < i; j++) {
1422       if (NewBldVec[i] == NewBldVec[j]) {
1423         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1424         RemapSwizzle[i] = j;
1425         break;
1426       }
1427     }
1428   }
1429
1430   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1431       VectorEntry.getValueType(), NewBldVec, 4);
1432 }
1433
1434 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1435                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1436   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1437   assert(RemapSwizzle.empty());
1438   SDValue NewBldVec[4] = {
1439       VectorEntry.getOperand(0),
1440       VectorEntry.getOperand(1),
1441       VectorEntry.getOperand(2),
1442       VectorEntry.getOperand(3)
1443   };
1444   bool isUnmovable[4] = { false, false, false, false };
1445   for (unsigned i = 0; i < 4; i++) {
1446     RemapSwizzle[i] = i;
1447     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1448       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1449           ->getZExtValue();
1450       if (i == Idx)
1451         isUnmovable[Idx] = true;
1452     }
1453   }
1454
1455   for (unsigned i = 0; i < 4; i++) {
1456     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1457       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1458           ->getZExtValue();
1459       if (isUnmovable[Idx])
1460         continue;
1461       // Swap i and Idx
1462       std::swap(NewBldVec[Idx], NewBldVec[i]);
1463       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1464       break;
1465     }
1466   }
1467
1468   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1469       VectorEntry.getValueType(), NewBldVec, 4);
1470 }
1471
1472
1473 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1474 SDValue Swz[4], SelectionDAG &DAG) const {
1475   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1476   // Old -> New swizzle values
1477   DenseMap<unsigned, unsigned> SwizzleRemap;
1478
1479   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1480   for (unsigned i = 0; i < 4; i++) {
1481     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1482     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1483       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1484   }
1485
1486   SwizzleRemap.clear();
1487   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1488   for (unsigned i = 0; i < 4; i++) {
1489     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1490     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1491       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1492   }
1493
1494   return BuildVector;
1495 }
1496
1497
1498 //===----------------------------------------------------------------------===//
1499 // Custom DAG Optimizations
1500 //===----------------------------------------------------------------------===//
1501
1502 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1503                                               DAGCombinerInfo &DCI) const {
1504   SelectionDAG &DAG = DCI.DAG;
1505
1506   switch (N->getOpcode()) {
1507   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1508   case ISD::FP_ROUND: {
1509       SDValue Arg = N->getOperand(0);
1510       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1511         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1512                            Arg.getOperand(0));
1513       }
1514       break;
1515     }
1516
1517   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1518   // (i32 select_cc f32, f32, -1, 0 cc)
1519   //
1520   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1521   // this to one of the SET*_DX10 instructions.
1522   case ISD::FP_TO_SINT: {
1523     SDValue FNeg = N->getOperand(0);
1524     if (FNeg.getOpcode() != ISD::FNEG) {
1525       return SDValue();
1526     }
1527     SDValue SelectCC = FNeg.getOperand(0);
1528     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1529         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1530         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1531         !isHWTrueValue(SelectCC.getOperand(2)) ||
1532         !isHWFalseValue(SelectCC.getOperand(3))) {
1533       return SDValue();
1534     }
1535
1536     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1537                            SelectCC.getOperand(0), // LHS
1538                            SelectCC.getOperand(1), // RHS
1539                            DAG.getConstant(-1, MVT::i32), // True
1540                            DAG.getConstant(0, MVT::i32),  // Flase
1541                            SelectCC.getOperand(4)); // CC
1542
1543     break;
1544   }
1545
1546   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1547   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1548   case ISD::INSERT_VECTOR_ELT: {
1549     SDValue InVec = N->getOperand(0);
1550     SDValue InVal = N->getOperand(1);
1551     SDValue EltNo = N->getOperand(2);
1552     SDLoc dl(N);
1553
1554     // If the inserted element is an UNDEF, just use the input vector.
1555     if (InVal.getOpcode() == ISD::UNDEF)
1556       return InVec;
1557
1558     EVT VT = InVec.getValueType();
1559
1560     // If we can't generate a legal BUILD_VECTOR, exit
1561     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1562       return SDValue();
1563
1564     // Check that we know which element is being inserted
1565     if (!isa<ConstantSDNode>(EltNo))
1566       return SDValue();
1567     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1568
1569     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1570     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1571     // vector elements.
1572     SmallVector<SDValue, 8> Ops;
1573     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1574       Ops.append(InVec.getNode()->op_begin(),
1575                  InVec.getNode()->op_end());
1576     } else if (InVec.getOpcode() == ISD::UNDEF) {
1577       unsigned NElts = VT.getVectorNumElements();
1578       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1579     } else {
1580       return SDValue();
1581     }
1582
1583     // Insert the element
1584     if (Elt < Ops.size()) {
1585       // All the operands of BUILD_VECTOR must have the same type;
1586       // we enforce that here.
1587       EVT OpVT = Ops[0].getValueType();
1588       if (InVal.getValueType() != OpVT)
1589         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1590           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1591           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1592       Ops[Elt] = InVal;
1593     }
1594
1595     // Return the new vector
1596     return DAG.getNode(ISD::BUILD_VECTOR, dl,
1597                        VT, &Ops[0], Ops.size());
1598   }
1599
1600   // Extract_vec (Build_vector) generated by custom lowering
1601   // also needs to be customly combined
1602   case ISD::EXTRACT_VECTOR_ELT: {
1603     SDValue Arg = N->getOperand(0);
1604     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1605       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1606         unsigned Element = Const->getZExtValue();
1607         return Arg->getOperand(Element);
1608       }
1609     }
1610     if (Arg.getOpcode() == ISD::BITCAST &&
1611         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1612       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1613         unsigned Element = Const->getZExtValue();
1614         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1615             Arg->getOperand(0).getOperand(Element));
1616       }
1617     }
1618   }
1619
1620   case ISD::SELECT_CC: {
1621     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1622     //      selectcc x, y, a, b, inv(cc)
1623     //
1624     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1625     //      selectcc x, y, a, b, cc
1626     SDValue LHS = N->getOperand(0);
1627     if (LHS.getOpcode() != ISD::SELECT_CC) {
1628       return SDValue();
1629     }
1630
1631     SDValue RHS = N->getOperand(1);
1632     SDValue True = N->getOperand(2);
1633     SDValue False = N->getOperand(3);
1634     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1635
1636     if (LHS.getOperand(2).getNode() != True.getNode() ||
1637         LHS.getOperand(3).getNode() != False.getNode() ||
1638         RHS.getNode() != False.getNode()) {
1639       return SDValue();
1640     }
1641
1642     switch (NCC) {
1643     default: return SDValue();
1644     case ISD::SETNE: return LHS;
1645     case ISD::SETEQ: {
1646       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1647       LHSCC = ISD::getSetCCInverse(LHSCC,
1648                                   LHS.getOperand(0).getValueType().isInteger());
1649       if (DCI.isBeforeLegalizeOps() ||
1650           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1651         return DAG.getSelectCC(SDLoc(N),
1652                                LHS.getOperand(0),
1653                                LHS.getOperand(1),
1654                                LHS.getOperand(2),
1655                                LHS.getOperand(3),
1656                                LHSCC);
1657       break;
1658     }
1659     }
1660     return SDValue();
1661   }
1662
1663   case AMDGPUISD::EXPORT: {
1664     SDValue Arg = N->getOperand(1);
1665     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1666       break;
1667
1668     SDValue NewArgs[8] = {
1669       N->getOperand(0), // Chain
1670       SDValue(),
1671       N->getOperand(2), // ArrayBase
1672       N->getOperand(3), // Type
1673       N->getOperand(4), // SWZ_X
1674       N->getOperand(5), // SWZ_Y
1675       N->getOperand(6), // SWZ_Z
1676       N->getOperand(7) // SWZ_W
1677     };
1678     SDLoc DL(N);
1679     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1680     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1681   }
1682   case AMDGPUISD::TEXTURE_FETCH: {
1683     SDValue Arg = N->getOperand(1);
1684     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1685       break;
1686
1687     SDValue NewArgs[19] = {
1688       N->getOperand(0),
1689       N->getOperand(1),
1690       N->getOperand(2),
1691       N->getOperand(3),
1692       N->getOperand(4),
1693       N->getOperand(5),
1694       N->getOperand(6),
1695       N->getOperand(7),
1696       N->getOperand(8),
1697       N->getOperand(9),
1698       N->getOperand(10),
1699       N->getOperand(11),
1700       N->getOperand(12),
1701       N->getOperand(13),
1702       N->getOperand(14),
1703       N->getOperand(15),
1704       N->getOperand(16),
1705       N->getOperand(17),
1706       N->getOperand(18),
1707     };
1708     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1709     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1710         NewArgs, 19);
1711   }
1712   }
1713   return SDValue();
1714 }
1715
1716 static bool
1717 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
1718             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
1719   const R600InstrInfo *TII =
1720       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1721   if (!Src.isMachineOpcode())
1722     return false;
1723   switch (Src.getMachineOpcode()) {
1724   case AMDGPU::FNEG_R600:
1725     if (!Neg.getNode())
1726       return false;
1727     Src = Src.getOperand(0);
1728     Neg = DAG.getTargetConstant(1, MVT::i32);
1729     return true;
1730   case AMDGPU::FABS_R600:
1731     if (!Abs.getNode())
1732       return false;
1733     Src = Src.getOperand(0);
1734     Abs = DAG.getTargetConstant(1, MVT::i32);
1735     return true;
1736   case AMDGPU::CONST_COPY: {
1737     unsigned Opcode = ParentNode->getMachineOpcode();
1738     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1739
1740     if (!Sel.getNode())
1741       return false;
1742
1743     SDValue CstOffset = Src.getOperand(0);
1744     if (ParentNode->getValueType(0).isVector())
1745       return false;
1746
1747     // Gather constants values
1748     int SrcIndices[] = {
1749       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1750       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1751       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
1752       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1753       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1754       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1755       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1756       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1757       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1758       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1759       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1760     };
1761     std::vector<unsigned> Consts;
1762     for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
1763       int OtherSrcIdx = SrcIndices[i];
1764       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1765       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
1766         continue;
1767       if (HasDst) {
1768         OtherSrcIdx--;
1769         OtherSelIdx--;
1770       }
1771       if (RegisterSDNode *Reg =
1772           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
1773         if (Reg->getReg() == AMDGPU::ALU_CONST) {
1774           ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
1775               ParentNode->getOperand(OtherSelIdx));
1776           Consts.push_back(Cst->getZExtValue());
1777         }
1778       }
1779     }
1780
1781     ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
1782     Consts.push_back(Cst->getZExtValue());
1783     if (!TII->fitsConstReadLimitations(Consts)) {
1784       return false;
1785     }
1786
1787     Sel = CstOffset;
1788     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
1789     return true;
1790   }
1791   case AMDGPU::MOV_IMM_I32:
1792   case AMDGPU::MOV_IMM_F32: {
1793     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
1794     uint64_t ImmValue = 0;
1795
1796
1797     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
1798       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
1799       float FloatValue = FPC->getValueAPF().convertToFloat();
1800       if (FloatValue == 0.0) {
1801         ImmReg = AMDGPU::ZERO;
1802       } else if (FloatValue == 0.5) {
1803         ImmReg = AMDGPU::HALF;
1804       } else if (FloatValue == 1.0) {
1805         ImmReg = AMDGPU::ONE;
1806       } else {
1807         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
1808       }
1809     } else {
1810       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
1811       uint64_t Value = C->getZExtValue();
1812       if (Value == 0) {
1813         ImmReg = AMDGPU::ZERO;
1814       } else if (Value == 1) {
1815         ImmReg = AMDGPU::ONE_INT;
1816       } else {
1817         ImmValue = Value;
1818       }
1819     }
1820
1821     // Check that we aren't already using an immediate.
1822     // XXX: It's possible for an instruction to have more than one
1823     // immediate operand, but this is not supported yet.
1824     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
1825       if (!Imm.getNode())
1826         return false;
1827       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
1828       assert(C);
1829       if (C->getZExtValue())
1830         return false;
1831       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
1832     }
1833     Src = DAG.getRegister(ImmReg, MVT::i32);
1834     return true;
1835   }
1836   default:
1837     return false;
1838   }
1839 }
1840
1841
1842 /// \brief Fold the instructions after selecting them
1843 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
1844                                             SelectionDAG &DAG) const {
1845   const R600InstrInfo *TII =
1846       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1847   if (!Node->isMachineOpcode())
1848     return Node;
1849   unsigned Opcode = Node->getMachineOpcode();
1850   SDValue FakeOp;
1851
1852   std::vector<SDValue> Ops;
1853   for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
1854               I != E; ++I)
1855           Ops.push_back(*I);
1856
1857   if (Opcode == AMDGPU::DOT_4) {
1858     int OperandIdx[] = {
1859       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1860       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1861       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1862       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1863       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1864       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1865       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1866       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1867         };
1868     int NegIdx[] = {
1869       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
1870       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
1871       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
1872       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
1873       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
1874       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
1875       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
1876       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
1877     };
1878     int AbsIdx[] = {
1879       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
1880       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
1881       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
1882       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
1883       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
1884       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
1885       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
1886       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
1887     };
1888     for (unsigned i = 0; i < 8; i++) {
1889       if (OperandIdx[i] < 0)
1890         return Node;
1891       SDValue &Src = Ops[OperandIdx[i] - 1];
1892       SDValue &Neg = Ops[NegIdx[i] - 1];
1893       SDValue &Abs = Ops[AbsIdx[i] - 1];
1894       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1895       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1896       if (HasDst)
1897         SelIdx--;
1898       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1899       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
1900         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1901     }
1902   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
1903     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
1904       SDValue &Src = Ops[i];
1905       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
1906         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1907     }
1908   } else if (Opcode == AMDGPU::CLAMP_R600) {
1909     SDValue Src = Node->getOperand(0);
1910     if (!Src.isMachineOpcode() ||
1911         !TII->hasInstrModifiers(Src.getMachineOpcode()))
1912       return Node;
1913     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
1914         AMDGPU::OpName::clamp);
1915     if (ClampIdx < 0)
1916       return Node;
1917     std::vector<SDValue> Ops;
1918     unsigned NumOp = Src.getNumOperands();
1919     for(unsigned i = 0; i < NumOp; ++i)
1920           Ops.push_back(Src.getOperand(i));
1921     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
1922     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
1923         Node->getVTList(), Ops);
1924   } else {
1925     if (!TII->hasInstrModifiers(Opcode))
1926       return Node;
1927     int OperandIdx[] = {
1928       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1929       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1930       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
1931     };
1932     int NegIdx[] = {
1933       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
1934       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
1935       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
1936     };
1937     int AbsIdx[] = {
1938       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
1939       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
1940       -1
1941     };
1942     for (unsigned i = 0; i < 3; i++) {
1943       if (OperandIdx[i] < 0)
1944         return Node;
1945       SDValue &Src = Ops[OperandIdx[i] - 1];
1946       SDValue &Neg = Ops[NegIdx[i] - 1];
1947       SDValue FakeAbs;
1948       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
1949       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1950       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1951       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
1952       if (HasDst) {
1953         SelIdx--;
1954         ImmIdx--;
1955       }
1956       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1957       SDValue &Imm = Ops[ImmIdx];
1958       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
1959         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1960     }
1961   }
1962
1963   return Node;
1964 }