lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/MachineFrameInfo.h"
  20 #include "llvm/CodeGen/MachineInstrBuilder.h"
  21 #include "llvm/CodeGen/MachineRegisterInfo.h"
  22 #include "llvm/CodeGen/SelectionDAG.h"
  23 #include "llvm/IR/Argument.h"
  24 #include "llvm/IR/Function.h"
  25
  26 using namespace llvm;
  27
  28 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  29     AMDGPUTargetLowering(TM),
  30     TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  31   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  32   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  33   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  34   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  35   computeRegisterProperties();
  36
  37   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  38   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  39   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  40   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  41
  42   setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
  43   setOperationAction(ISD::AND,  MVT::v4i32, Expand);
  44   setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  45   setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  46   setOperationAction(ISD::MUL,  MVT::v2i32, Expand);
  47   setOperationAction(ISD::MUL,  MVT::v4i32, Expand);
  48   setOperationAction(ISD::OR, MVT::v4i32, Expand);
  49   setOperationAction(ISD::OR, MVT::v2i32, Expand);
  50   setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  51   setOperationAction(ISD::SHL, MVT::v4i32, Expand);
  52   setOperationAction(ISD::SHL, MVT::v2i32, Expand);
  53   setOperationAction(ISD::SRL, MVT::v4i32, Expand);
  54   setOperationAction(ISD::SRL, MVT::v2i32, Expand);
  55   setOperationAction(ISD::SRA, MVT::v4i32, Expand);
  56   setOperationAction(ISD::SRA, MVT::v2i32, Expand);
  57   setOperationAction(ISD::SUB, MVT::v4i32, Expand);
  58   setOperationAction(ISD::SUB, MVT::v2i32, Expand);
  59   setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  60   setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  61   setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  62   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  63   setOperationAction(ISD::XOR, MVT::v4i32, Expand);
  64   setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  65
  66   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  67   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  68
  69   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  70
  71   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  72   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  73   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  74
  75   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  76   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  77
  78   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  79   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  80   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  81
  82   setOperationAction(ISD::SELECT, MVT::i32, Custom);
  83   setOperationAction(ISD::SELECT, MVT::f32, Custom);
  84
  85   setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
  86   setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
  87
  88   // Legalize loads and stores to the private address space.
  89   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  90   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  91   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  92   setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  93   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  94   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  95   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  96   setOperationAction(ISD::STORE, MVT::i8, Custom);
  97   setOperationAction(ISD::STORE, MVT::i32, Custom);
  98   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  99   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 100
 101   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 102   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 103   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 104
 105   setTargetDAGCombine(ISD::FP_ROUND);
 106   setTargetDAGCombine(ISD::FP_TO_SINT);
 107   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 108   setTargetDAGCombine(ISD::SELECT_CC);
 109
 110   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 111   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 112   setSchedulingPreference(Sched::VLIW);
 113 }
 114
 115 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 116     MachineInstr * MI, MachineBasicBlock * BB) const {
 117   MachineFunction * MF = BB->getParent();
 118   MachineRegisterInfo &MRI = MF->getRegInfo();
 119   MachineBasicBlock::iterator I = *MI;
 120
 121   switch (MI->getOpcode()) {
 122   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 123   case AMDGPU::CLAMP_R600: {
 124     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 125                                                    AMDGPU::MOV,
 126                                                    MI->getOperand(0).getReg(),
 127                                                    MI->getOperand(1).getReg());
 128     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 129     break;
 130   }
 131
 132   case AMDGPU::FABS_R600: {
 133     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 134                                                     AMDGPU::MOV,
 135                                                     MI->getOperand(0).getReg(),
 136                                                     MI->getOperand(1).getReg());
 137     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 138     break;
 139   }
 140
 141   case AMDGPU::FNEG_R600: {
 142     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 143                                                     AMDGPU::MOV,
 144                                                     MI->getOperand(0).getReg(),
 145                                                     MI->getOperand(1).getReg());
 146     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 147     break;
 148   }
 149
 150   case AMDGPU::MASK_WRITE: {
 151     unsigned maskedRegister = MI->getOperand(0).getReg();
 152     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 153     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 154     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 155     break;
 156   }
 157
 158   case AMDGPU::MOV_IMM_F32:
 159     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 160                      MI->getOperand(1).getFPImm()->getValueAPF()
 161                          .bitcastToAPInt().getZExtValue());
 162     break;
 163   case AMDGPU::MOV_IMM_I32:
 164     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 165                      MI->getOperand(1).getImm());
 166     break;
 167   case AMDGPU::CONST_COPY: {
 168     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 169         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 170     TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
 171         MI->getOperand(1).getImm());
 172     break;
 173   }
 174
 175   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 176   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 177     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 178
 179     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 180             .addOperand(MI->getOperand(0))
 181             .addOperand(MI->getOperand(1))
 182             .addImm(EOP); // Set End of program bit
 183     break;
 184   }
 185
 186   case AMDGPU::TXD: {
 187     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 188     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 189     MachineOperand &RID = MI->getOperand(4);
 190     MachineOperand &SID = MI->getOperand(5);
 191     unsigned TextureId = MI->getOperand(6).getImm();
 192     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 193     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 194
 195     switch (TextureId) {
 196     case 5: // Rect
 197       CTX = CTY = 0;
 198       break;
 199     case 6: // Shadow1D
 200       SrcW = SrcZ;
 201       break;
 202     case 7: // Shadow2D
 203       SrcW = SrcZ;
 204       break;
 205     case 8: // ShadowRect
 206       CTX = CTY = 0;
 207       SrcW = SrcZ;
 208       break;
 209     case 9: // 1DArray
 210       SrcZ = SrcY;
 211       CTZ = 0;
 212       break;
 213     case 10: // 2DArray
 214       CTZ = 0;
 215       break;
 216     case 11: // Shadow1DArray
 217       SrcZ = SrcY;
 218       CTZ = 0;
 219       break;
 220     case 12: // Shadow2DArray
 221       CTZ = 0;
 222       break;
 223     }
 224     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 225             .addOperand(MI->getOperand(3))
 226             .addImm(SrcX)
 227             .addImm(SrcY)
 228             .addImm(SrcZ)
 229             .addImm(SrcW)
 230             .addImm(0)
 231             .addImm(0)
 232             .addImm(0)
 233             .addImm(0)
 234             .addImm(1)
 235             .addImm(2)
 236             .addImm(3)
 237             .addOperand(RID)
 238             .addOperand(SID)
 239             .addImm(CTX)
 240             .addImm(CTY)
 241             .addImm(CTZ)
 242             .addImm(CTW);
 243     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 244             .addOperand(MI->getOperand(2))
 245             .addImm(SrcX)
 246             .addImm(SrcY)
 247             .addImm(SrcZ)
 248             .addImm(SrcW)
 249             .addImm(0)
 250             .addImm(0)
 251             .addImm(0)
 252             .addImm(0)
 253             .addImm(1)
 254             .addImm(2)
 255             .addImm(3)
 256             .addOperand(RID)
 257             .addOperand(SID)
 258             .addImm(CTX)
 259             .addImm(CTY)
 260             .addImm(CTZ)
 261             .addImm(CTW);
 262     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 263             .addOperand(MI->getOperand(0))
 264             .addOperand(MI->getOperand(1))
 265             .addImm(SrcX)
 266             .addImm(SrcY)
 267             .addImm(SrcZ)
 268             .addImm(SrcW)
 269             .addImm(0)
 270             .addImm(0)
 271             .addImm(0)
 272             .addImm(0)
 273             .addImm(1)
 274             .addImm(2)
 275             .addImm(3)
 276             .addOperand(RID)
 277             .addOperand(SID)
 278             .addImm(CTX)
 279             .addImm(CTY)
 280             .addImm(CTZ)
 281             .addImm(CTW)
 282             .addReg(T0, RegState::Implicit)
 283             .addReg(T1, RegState::Implicit);
 284     break;
 285   }
 286
 287   case AMDGPU::TXD_SHADOW: {
 288     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 289     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 290     MachineOperand &RID = MI->getOperand(4);
 291     MachineOperand &SID = MI->getOperand(5);
 292     unsigned TextureId = MI->getOperand(6).getImm();
 293     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 294     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 295
 296     switch (TextureId) {
 297     case 5: // Rect
 298       CTX = CTY = 0;
 299       break;
 300     case 6: // Shadow1D
 301       SrcW = SrcZ;
 302       break;
 303     case 7: // Shadow2D
 304       SrcW = SrcZ;
 305       break;
 306     case 8: // ShadowRect
 307       CTX = CTY = 0;
 308       SrcW = SrcZ;
 309       break;
 310     case 9: // 1DArray
 311       SrcZ = SrcY;
 312       CTZ = 0;
 313       break;
 314     case 10: // 2DArray
 315       CTZ = 0;
 316       break;
 317     case 11: // Shadow1DArray
 318       SrcZ = SrcY;
 319       CTZ = 0;
 320       break;
 321     case 12: // Shadow2DArray
 322       CTZ = 0;
 323       break;
 324     }
 325
 326     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 327             .addOperand(MI->getOperand(3))
 328             .addImm(SrcX)
 329             .addImm(SrcY)
 330             .addImm(SrcZ)
 331             .addImm(SrcW)
 332             .addImm(0)
 333             .addImm(0)
 334             .addImm(0)
 335             .addImm(0)
 336             .addImm(1)
 337             .addImm(2)
 338             .addImm(3)
 339             .addOperand(RID)
 340             .addOperand(SID)
 341             .addImm(CTX)
 342             .addImm(CTY)
 343             .addImm(CTZ)
 344             .addImm(CTW);
 345     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 346             .addOperand(MI->getOperand(2))
 347             .addImm(SrcX)
 348             .addImm(SrcY)
 349             .addImm(SrcZ)
 350             .addImm(SrcW)
 351             .addImm(0)
 352             .addImm(0)
 353             .addImm(0)
 354             .addImm(0)
 355             .addImm(1)
 356             .addImm(2)
 357             .addImm(3)
 358             .addOperand(RID)
 359             .addOperand(SID)
 360             .addImm(CTX)
 361             .addImm(CTY)
 362             .addImm(CTZ)
 363             .addImm(CTW);
 364     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 365             .addOperand(MI->getOperand(0))
 366             .addOperand(MI->getOperand(1))
 367             .addImm(SrcX)
 368             .addImm(SrcY)
 369             .addImm(SrcZ)
 370             .addImm(SrcW)
 371             .addImm(0)
 372             .addImm(0)
 373             .addImm(0)
 374             .addImm(0)
 375             .addImm(1)
 376             .addImm(2)
 377             .addImm(3)
 378             .addOperand(RID)
 379             .addOperand(SID)
 380             .addImm(CTX)
 381             .addImm(CTY)
 382             .addImm(CTZ)
 383             .addImm(CTW)
 384             .addReg(T0, RegState::Implicit)
 385             .addReg(T1, RegState::Implicit);
 386     break;
 387   }
 388
 389   case AMDGPU::BRANCH:
 390       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 391               .addOperand(MI->getOperand(0));
 392       break;
 393
 394   case AMDGPU::BRANCH_COND_f32: {
 395     MachineInstr *NewMI =
 396       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 397               AMDGPU::PREDICATE_BIT)
 398               .addOperand(MI->getOperand(1))
 399               .addImm(OPCODE_IS_NOT_ZERO)
 400               .addImm(0); // Flags
 401     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 402     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 403             .addOperand(MI->getOperand(0))
 404             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 405     break;
 406   }
 407
 408   case AMDGPU::BRANCH_COND_i32: {
 409     MachineInstr *NewMI =
 410       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 411             AMDGPU::PREDICATE_BIT)
 412             .addOperand(MI->getOperand(1))
 413             .addImm(OPCODE_IS_NOT_ZERO_INT)
 414             .addImm(0); // Flags
 415     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 416     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 417            .addOperand(MI->getOperand(0))
 418             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 419     break;
 420   }
 421
 422   case AMDGPU::EG_ExportSwz:
 423   case AMDGPU::R600_ExportSwz: {
 424     // Instruction is left unmodified if its not the last one of its type
 425     bool isLastInstructionOfItsType = true;
 426     unsigned InstExportType = MI->getOperand(1).getImm();
 427     for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
 428          EndBlock = BB->end(); NextExportInst != EndBlock;
 429          NextExportInst = llvm::next(NextExportInst)) {
 430       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 431           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 432         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 433             .getImm();
 434         if (CurrentInstExportType == InstExportType) {
 435           isLastInstructionOfItsType = false;
 436           break;
 437         }
 438       }
 439     }
 440     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 441     if (!EOP && !isLastInstructionOfItsType)
 442       return BB;
 443     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 444     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 445             .addOperand(MI->getOperand(0))
 446             .addOperand(MI->getOperand(1))
 447             .addOperand(MI->getOperand(2))
 448             .addOperand(MI->getOperand(3))
 449             .addOperand(MI->getOperand(4))
 450             .addOperand(MI->getOperand(5))
 451             .addOperand(MI->getOperand(6))
 452             .addImm(CfInst)
 453             .addImm(EOP);
 454     break;
 455   }
 456   case AMDGPU::RETURN: {
 457     // RETURN instructions must have the live-out registers as implicit uses,
 458     // otherwise they appear dead.
 459     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 460     MachineInstrBuilder MIB(*MF, MI);
 461     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 462       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 463     return BB;
 464   }
 465   }
 466
 467   MI->eraseFromParent();
 468   return BB;
 469 }
 470
 471 //===----------------------------------------------------------------------===//
 472 // Custom DAG Lowering Operations
 473 //===----------------------------------------------------------------------===//
 474
 475 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 476   switch (Op.getOpcode()) {
 477   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 478   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 479   case ISD::SELECT: return LowerSELECT(Op, DAG);
 480   case ISD::STORE: return LowerSTORE(Op, DAG);
 481   case ISD::LOAD: return LowerLOAD(Op, DAG);
 482   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
 483   case ISD::INTRINSIC_VOID: {
 484     SDValue Chain = Op.getOperand(0);
 485     unsigned IntrinsicID =
 486                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 487     switch (IntrinsicID) {
 488     case AMDGPUIntrinsic::AMDGPU_store_output: {
 489       MachineFunction &MF = DAG.getMachineFunction();
 490       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 491       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 492       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 493       MFI->LiveOuts.push_back(Reg);
 494       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 495     }
 496     case AMDGPUIntrinsic::R600_store_swizzle: {
 497       const SDValue Args[8] = {
 498         Chain,
 499         Op.getOperand(2), // Export Value
 500         Op.getOperand(3), // ArrayBase
 501         Op.getOperand(4), // Type
 502         DAG.getConstant(0, MVT::i32), // SWZ_X
 503         DAG.getConstant(1, MVT::i32), // SWZ_Y
 504         DAG.getConstant(2, MVT::i32), // SWZ_Z
 505         DAG.getConstant(3, MVT::i32) // SWZ_W
 506       };
 507       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
 508           Args, 8);
 509     }
 510
 511     // default for switch(IntrinsicID)
 512     default: break;
 513     }
 514     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 515     break;
 516   }
 517   case ISD::INTRINSIC_WO_CHAIN: {
 518     unsigned IntrinsicID =
 519                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 520     EVT VT = Op.getValueType();
 521     SDLoc DL(Op);
 522     switch(IntrinsicID) {
 523     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 524     case AMDGPUIntrinsic::R600_load_input: {
 525       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 526       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 527       MachineFunction &MF = DAG.getMachineFunction();
 528       MachineRegisterInfo &MRI = MF.getRegInfo();
 529       MRI.addLiveIn(Reg);
 530       return DAG.getCopyFromReg(DAG.getEntryNode(),
 531           SDLoc(DAG.getEntryNode()), Reg, VT);
 532     }
 533
 534     case AMDGPUIntrinsic::R600_interp_input: {
 535       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 536       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 537       MachineSDNode *interp;
 538       if (ijb < 0) {
 539         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 540             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 541         return DAG.getTargetExtractSubreg(
 542             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 543             DL, MVT::f32, SDValue(interp, 0));
 544       }
 545
 546       if (slot % 4 < 2)
 547         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 548             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 549             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 550                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
 551             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 552                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
 553       else
 554         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 555             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 556             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 557                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
 558             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 559                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
 560
 561       return SDValue(interp, slot % 2);
 562     }
 563     case AMDGPUIntrinsic::R600_tex:
 564     case AMDGPUIntrinsic::R600_texc:
 565     case AMDGPUIntrinsic::R600_txl:
 566     case AMDGPUIntrinsic::R600_txlc:
 567     case AMDGPUIntrinsic::R600_txb:
 568     case AMDGPUIntrinsic::R600_txbc:
 569     case AMDGPUIntrinsic::R600_txf:
 570     case AMDGPUIntrinsic::R600_txq:
 571     case AMDGPUIntrinsic::R600_ddx:
 572     case AMDGPUIntrinsic::R600_ddy: {
 573       unsigned TextureOp;
 574       switch (IntrinsicID) {
 575       case AMDGPUIntrinsic::R600_tex:
 576         TextureOp = 0;
 577         break;
 578       case AMDGPUIntrinsic::R600_texc:
 579         TextureOp = 1;
 580         break;
 581       case AMDGPUIntrinsic::R600_txl:
 582         TextureOp = 2;
 583         break;
 584       case AMDGPUIntrinsic::R600_txlc:
 585         TextureOp = 3;
 586         break;
 587       case AMDGPUIntrinsic::R600_txb:
 588         TextureOp = 4;
 589         break;
 590       case AMDGPUIntrinsic::R600_txbc:
 591         TextureOp = 5;
 592         break;
 593       case AMDGPUIntrinsic::R600_txf:
 594         TextureOp = 6;
 595         break;
 596       case AMDGPUIntrinsic::R600_txq:
 597         TextureOp = 7;
 598         break;
 599       case AMDGPUIntrinsic::R600_ddx:
 600         TextureOp = 8;
 601         break;
 602       case AMDGPUIntrinsic::R600_ddy:
 603         TextureOp = 9;
 604         break;
 605       default:
 606         llvm_unreachable("Unknow Texture Operation");
 607       }
 608
 609       SDValue TexArgs[19] = {
 610         DAG.getConstant(TextureOp, MVT::i32),
 611         Op.getOperand(1),
 612         DAG.getConstant(0, MVT::i32),
 613         DAG.getConstant(1, MVT::i32),
 614         DAG.getConstant(2, MVT::i32),
 615         DAG.getConstant(3, MVT::i32),
 616         Op.getOperand(2),
 617         Op.getOperand(3),
 618         Op.getOperand(4),
 619         DAG.getConstant(0, MVT::i32),
 620         DAG.getConstant(1, MVT::i32),
 621         DAG.getConstant(2, MVT::i32),
 622         DAG.getConstant(3, MVT::i32),
 623         Op.getOperand(5),
 624         Op.getOperand(6),
 625         Op.getOperand(7),
 626         Op.getOperand(8),
 627         Op.getOperand(9),
 628         Op.getOperand(10)
 629       };
 630       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
 631     }
 632     case AMDGPUIntrinsic::AMDGPU_dp4: {
 633       SDValue Args[8] = {
 634       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 635           DAG.getConstant(0, MVT::i32)),
 636       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 637           DAG.getConstant(0, MVT::i32)),
 638       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 639           DAG.getConstant(1, MVT::i32)),
 640       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 641           DAG.getConstant(1, MVT::i32)),
 642       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 643           DAG.getConstant(2, MVT::i32)),
 644       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 645           DAG.getConstant(2, MVT::i32)),
 646       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 647           DAG.getConstant(3, MVT::i32)),
 648       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 649           DAG.getConstant(3, MVT::i32))
 650       };
 651       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
 652     }
 653
 654     case Intrinsic::r600_read_ngroups_x:
 655       return LowerImplicitParameter(DAG, VT, DL, 0);
 656     case Intrinsic::r600_read_ngroups_y:
 657       return LowerImplicitParameter(DAG, VT, DL, 1);
 658     case Intrinsic::r600_read_ngroups_z:
 659       return LowerImplicitParameter(DAG, VT, DL, 2);
 660     case Intrinsic::r600_read_global_size_x:
 661       return LowerImplicitParameter(DAG, VT, DL, 3);
 662     case Intrinsic::r600_read_global_size_y:
 663       return LowerImplicitParameter(DAG, VT, DL, 4);
 664     case Intrinsic::r600_read_global_size_z:
 665       return LowerImplicitParameter(DAG, VT, DL, 5);
 666     case Intrinsic::r600_read_local_size_x:
 667       return LowerImplicitParameter(DAG, VT, DL, 6);
 668     case Intrinsic::r600_read_local_size_y:
 669       return LowerImplicitParameter(DAG, VT, DL, 7);
 670     case Intrinsic::r600_read_local_size_z:
 671       return LowerImplicitParameter(DAG, VT, DL, 8);
 672
 673     case Intrinsic::r600_read_tgid_x:
 674       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 675                                   AMDGPU::T1_X, VT);
 676     case Intrinsic::r600_read_tgid_y:
 677       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 678                                   AMDGPU::T1_Y, VT);
 679     case Intrinsic::r600_read_tgid_z:
 680       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 681                                   AMDGPU::T1_Z, VT);
 682     case Intrinsic::r600_read_tidig_x:
 683       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 684                                   AMDGPU::T0_X, VT);
 685     case Intrinsic::r600_read_tidig_y:
 686       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 687                                   AMDGPU::T0_Y, VT);
 688     case Intrinsic::r600_read_tidig_z:
 689       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 690                                   AMDGPU::T0_Z, VT);
 691     }
 692     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 693     break;
 694   }
 695   } // end switch(Op.getOpcode())
 696   return SDValue();
 697 }
 698
 699 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 700                                             SmallVectorImpl<SDValue> &Results,
 701                                             SelectionDAG &DAG) const {
 702   switch (N->getOpcode()) {
 703   default: return;
 704   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 705     return;
 706   case ISD::LOAD: {
 707     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 708     Results.push_back(SDValue(Node, 0));
 709     Results.push_back(SDValue(Node, 1));
 710     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 711     // function
 712     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 713     return;
 714   }
 715   case ISD::STORE:
 716     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 717     Results.push_back(SDValue(Node, 0));
 718     return;
 719   }
 720 }
 721
 722 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 723   return DAG.getNode(
 724       ISD::SETCC,
 725       SDLoc(Op),
 726       MVT::i1,
 727       Op, DAG.getConstantFP(0.0f, MVT::f32),
 728       DAG.getCondCode(ISD::SETNE)
 729       );
 730 }
 731
 732 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 733                                                    SDLoc DL,
 734                                                    unsigned DwordOffset) const {
 735   unsigned ByteOffset = DwordOffset * 4;
 736   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 737                                       AMDGPUAS::PARAM_I_ADDRESS);
 738
 739   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 740   assert(isInt<16>(ByteOffset));
 741
 742   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 743                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 744                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 745                      false, false, false, 0);
 746 }
 747
 748 SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
 749
 750   MachineFunction &MF = DAG.getMachineFunction();
 751   const AMDGPUFrameLowering *TFL =
 752    static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
 753
 754   FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
 755   assert(FIN);
 756
 757   unsigned FrameIndex = FIN->getIndex();
 758   unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
 759   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
 760 }
 761
 762 bool R600TargetLowering::isZero(SDValue Op) const {
 763   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 764     return Cst->isNullValue();
 765   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 766     return CstFP->isZero();
 767   } else {
 768     return false;
 769   }
 770 }
 771
 772 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 773   SDLoc DL(Op);
 774   EVT VT = Op.getValueType();
 775
 776   SDValue LHS = Op.getOperand(0);
 777   SDValue RHS = Op.getOperand(1);
 778   SDValue True = Op.getOperand(2);
 779   SDValue False = Op.getOperand(3);
 780   SDValue CC = Op.getOperand(4);
 781   SDValue Temp;
 782
 783   // LHS and RHS are guaranteed to be the same value type
 784   EVT CompareVT = LHS.getValueType();
 785
 786   // Check if we can lower this to a native operation.
 787
 788   // Try to lower to a SET* instruction:
 789   //
 790   // SET* can match the following patterns:
 791   //
 792   // select_cc f32, f32, -1,  0, cc_any
 793   // select_cc f32, f32, 1.0f, 0.0f, cc_any
 794   // select_cc i32, i32, -1,  0, cc_any
 795   //
 796
 797   // Move hardware True/False values to the correct operand.
 798   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 799     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 800     std::swap(False, True);
 801     CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
 802   }
 803
 804   if (isHWTrueValue(True) && isHWFalseValue(False) &&
 805       (CompareVT == VT || VT == MVT::i32)) {
 806     // This can be matched by a SET* instruction.
 807     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 808   }
 809
 810   // Try to lower to a CND* instruction:
 811   //
 812   // CND* can match the following patterns:
 813   //
 814   // select_cc f32, 0.0, f32, f32, cc_any
 815   // select_cc f32, 0.0, i32, i32, cc_any
 816   // select_cc i32, 0,   f32, f32, cc_any
 817   // select_cc i32, 0,   i32, i32, cc_any
 818   //
 819   if (isZero(LHS) || isZero(RHS)) {
 820     SDValue Cond = (isZero(LHS) ? RHS : LHS);
 821     SDValue Zero = (isZero(LHS) ? LHS : RHS);
 822     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 823     if (CompareVT != VT) {
 824       // Bitcast True / False to the correct types.  This will end up being
 825       // a nop, but it allows us to define only a single pattern in the
 826       // .TD files for each CND* instruction rather than having to have
 827       // one pattern for integer True/False and one for fp True/False
 828       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 829       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 830     }
 831     if (isZero(LHS)) {
 832       CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
 833     }
 834
 835     switch (CCOpcode) {
 836     case ISD::SETONE:
 837     case ISD::SETUNE:
 838     case ISD::SETNE:
 839     case ISD::SETULE:
 840     case ISD::SETULT:
 841     case ISD::SETOLE:
 842     case ISD::SETOLT:
 843     case ISD::SETLE:
 844     case ISD::SETLT:
 845       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 846       Temp = True;
 847       True = False;
 848       False = Temp;
 849       break;
 850     default:
 851       break;
 852     }
 853     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 854         Cond, Zero,
 855         True, False,
 856         DAG.getCondCode(CCOpcode));
 857     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 858   }
 859
 860
 861   // Possible Min/Max pattern
 862   SDValue MinMax = LowerMinMax(Op, DAG);
 863   if (MinMax.getNode()) {
 864     return MinMax;
 865   }
 866
 867   // If we make it this for it means we have no native instructions to handle
 868   // this SELECT_CC, so we must lower it.
 869   SDValue HWTrue, HWFalse;
 870
 871   if (CompareVT == MVT::f32) {
 872     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 873     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 874   } else if (CompareVT == MVT::i32) {
 875     HWTrue = DAG.getConstant(-1, CompareVT);
 876     HWFalse = DAG.getConstant(0, CompareVT);
 877   }
 878   else {
 879     assert(!"Unhandled value type in LowerSELECT_CC");
 880   }
 881
 882   // Lower this unsupported SELECT_CC into a combination of two supported
 883   // SELECT_CC operations.
 884   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 885
 886   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 887       Cond, HWFalse,
 888       True, False,
 889       DAG.getCondCode(ISD::SETNE));
 890 }
 891
 892 SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 893   return DAG.getNode(ISD::SELECT_CC,
 894       SDLoc(Op),
 895       Op.getValueType(),
 896       Op.getOperand(0),
 897       DAG.getConstant(0, MVT::i32),
 898       Op.getOperand(1),
 899       Op.getOperand(2),
 900       DAG.getCondCode(ISD::SETNE));
 901 }
 902
 903 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 904 /// convert these pointers to a register index.  Each register holds
 905 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
 906 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
 907 /// for indirect addressing.
 908 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
 909                                                unsigned StackWidth,
 910                                                SelectionDAG &DAG) const {
 911   unsigned SRLPad;
 912   switch(StackWidth) {
 913   case 1:
 914     SRLPad = 2;
 915     break;
 916   case 2:
 917     SRLPad = 3;
 918     break;
 919   case 4:
 920     SRLPad = 4;
 921     break;
 922   default: llvm_unreachable("Invalid stack width");
 923   }
 924
 925   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
 926                      DAG.getConstant(SRLPad, MVT::i32));
 927 }
 928
 929 void R600TargetLowering::getStackAddress(unsigned StackWidth,
 930                                          unsigned ElemIdx,
 931                                          unsigned &Channel,
 932                                          unsigned &PtrIncr) const {
 933   switch (StackWidth) {
 934   default:
 935   case 1:
 936     Channel = 0;
 937     if (ElemIdx > 0) {
 938       PtrIncr = 1;
 939     } else {
 940       PtrIncr = 0;
 941     }
 942     break;
 943   case 2:
 944     Channel = ElemIdx % 2;
 945     if (ElemIdx == 2) {
 946       PtrIncr = 1;
 947     } else {
 948       PtrIncr = 0;
 949     }
 950     break;
 951   case 4:
 952     Channel = ElemIdx;
 953     PtrIncr = 0;
 954     break;
 955   }
 956 }
 957
 958 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 959   SDLoc DL(Op);
 960   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
 961   SDValue Chain = Op.getOperand(0);
 962   SDValue Value = Op.getOperand(1);
 963   SDValue Ptr = Op.getOperand(2);
 964
 965   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
 966       Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
 967     // Convert pointer from byte address to dword address.
 968     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
 969                       DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
 970                                   Ptr, DAG.getConstant(2, MVT::i32)));
 971
 972     if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
 973       assert(!"Truncated and indexed stores not supported yet");
 974     } else {
 975       Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
 976     }
 977     return Chain;
 978   }
 979
 980   EVT ValueVT = Value.getValueType();
 981
 982   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
 983     return SDValue();
 984   }
 985
 986   // Lowering for indirect addressing
 987
 988   const MachineFunction &MF = DAG.getMachineFunction();
 989   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
 990                                          getTargetMachine().getFrameLowering());
 991   unsigned StackWidth = TFL->getStackWidth(MF);
 992
 993   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
 994
 995   if (ValueVT.isVector()) {
 996     unsigned NumElemVT = ValueVT.getVectorNumElements();
 997     EVT ElemVT = ValueVT.getVectorElementType();
 998     SDValue Stores[4];
 999
1000     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1001                                       "vector width in load");
1002
1003     for (unsigned i = 0; i < NumElemVT; ++i) {
1004       unsigned Channel, PtrIncr;
1005       getStackAddress(StackWidth, i, Channel, PtrIncr);
1006       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1007                         DAG.getConstant(PtrIncr, MVT::i32));
1008       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1009                                  Value, DAG.getConstant(i, MVT::i32));
1010
1011       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1012                               Chain, Elem, Ptr,
1013                               DAG.getTargetConstant(Channel, MVT::i32));
1014     }
1015      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1016    } else {
1017     if (ValueVT == MVT::i8) {
1018       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1019     }
1020     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1021     DAG.getTargetConstant(0, MVT::i32)); // Channel
1022   }
1023
1024   return Chain;
1025 }
1026
1027 // return (512 + (kc_bank << 12)
1028 static int
1029 ConstantAddressBlock(unsigned AddressSpace) {
1030   switch (AddressSpace) {
1031   case AMDGPUAS::CONSTANT_BUFFER_0:
1032     return 512;
1033   case AMDGPUAS::CONSTANT_BUFFER_1:
1034     return 512 + 4096;
1035   case AMDGPUAS::CONSTANT_BUFFER_2:
1036     return 512 + 4096 * 2;
1037   case AMDGPUAS::CONSTANT_BUFFER_3:
1038     return 512 + 4096 * 3;
1039   case AMDGPUAS::CONSTANT_BUFFER_4:
1040     return 512 + 4096 * 4;
1041   case AMDGPUAS::CONSTANT_BUFFER_5:
1042     return 512 + 4096 * 5;
1043   case AMDGPUAS::CONSTANT_BUFFER_6:
1044     return 512 + 4096 * 6;
1045   case AMDGPUAS::CONSTANT_BUFFER_7:
1046     return 512 + 4096 * 7;
1047   case AMDGPUAS::CONSTANT_BUFFER_8:
1048     return 512 + 4096 * 8;
1049   case AMDGPUAS::CONSTANT_BUFFER_9:
1050     return 512 + 4096 * 9;
1051   case AMDGPUAS::CONSTANT_BUFFER_10:
1052     return 512 + 4096 * 10;
1053   case AMDGPUAS::CONSTANT_BUFFER_11:
1054     return 512 + 4096 * 11;
1055   case AMDGPUAS::CONSTANT_BUFFER_12:
1056     return 512 + 4096 * 12;
1057   case AMDGPUAS::CONSTANT_BUFFER_13:
1058     return 512 + 4096 * 13;
1059   case AMDGPUAS::CONSTANT_BUFFER_14:
1060     return 512 + 4096 * 14;
1061   case AMDGPUAS::CONSTANT_BUFFER_15:
1062     return 512 + 4096 * 15;
1063   default:
1064     return -1;
1065   }
1066 }
1067
1068 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1069 {
1070   EVT VT = Op.getValueType();
1071   SDLoc DL(Op);
1072   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1073   SDValue Chain = Op.getOperand(0);
1074   SDValue Ptr = Op.getOperand(1);
1075   SDValue LoweredLoad;
1076
1077   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1078   if (ConstantBlock > -1) {
1079     SDValue Result;
1080     if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
1081         dyn_cast<Constant>(LoadNode->getSrcValue()) ||
1082         dyn_cast<ConstantSDNode>(Ptr)) {
1083       SDValue Slots[4];
1084       for (unsigned i = 0; i < 4; i++) {
1085         // We want Const position encoded with the following formula :
1086         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1087         // const_index is Ptr computed by llvm using an alignment of 16.
1088         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1089         // then div by 4 at the ISel step
1090         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1091             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1092         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1093       }
1094       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
1095     } else {
1096       // non constant ptr cant be folded, keeps it as a v4f32 load
1097       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1098           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1099           DAG.getConstant(LoadNode->getAddressSpace() -
1100                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1101           );
1102     }
1103
1104     if (!VT.isVector()) {
1105       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1106           DAG.getConstant(0, MVT::i32));
1107     }
1108
1109     SDValue MergedValues[2] = {
1110         Result,
1111         Chain
1112     };
1113     return DAG.getMergeValues(MergedValues, 2, DL);
1114   }
1115
1116   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1117     return SDValue();
1118   }
1119
1120   // Lowering for indirect addressing
1121   const MachineFunction &MF = DAG.getMachineFunction();
1122   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1123                                          getTargetMachine().getFrameLowering());
1124   unsigned StackWidth = TFL->getStackWidth(MF);
1125
1126   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1127
1128   if (VT.isVector()) {
1129     unsigned NumElemVT = VT.getVectorNumElements();
1130     EVT ElemVT = VT.getVectorElementType();
1131     SDValue Loads[4];
1132
1133     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1134                                       "vector width in load");
1135
1136     for (unsigned i = 0; i < NumElemVT; ++i) {
1137       unsigned Channel, PtrIncr;
1138       getStackAddress(StackWidth, i, Channel, PtrIncr);
1139       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1140                         DAG.getConstant(PtrIncr, MVT::i32));
1141       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1142                              Chain, Ptr,
1143                              DAG.getTargetConstant(Channel, MVT::i32),
1144                              Op.getOperand(2));
1145     }
1146     for (unsigned i = NumElemVT; i < 4; ++i) {
1147       Loads[i] = DAG.getUNDEF(ElemVT);
1148     }
1149     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1150     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1151   } else {
1152     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1153                               Chain, Ptr,
1154                               DAG.getTargetConstant(0, MVT::i32), // Channel
1155                               Op.getOperand(2));
1156   }
1157
1158   SDValue Ops[2];
1159   Ops[0] = LoweredLoad;
1160   Ops[1] = Chain;
1161
1162   return DAG.getMergeValues(Ops, 2, DL);
1163 }
1164
1165 /// XXX Only kernel functions are supported, so we can assume for now that
1166 /// every function is a kernel function, but in the future we should use
1167 /// separate calling conventions for kernel and non-kernel functions.
1168 SDValue R600TargetLowering::LowerFormalArguments(
1169                                       SDValue Chain,
1170                                       CallingConv::ID CallConv,
1171                                       bool isVarArg,
1172                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1173                                       SDLoc DL, SelectionDAG &DAG,
1174                                       SmallVectorImpl<SDValue> &InVals) const {
1175   unsigned ParamOffsetBytes = 36;
1176   Function::const_arg_iterator FuncArg =
1177                             DAG.getMachineFunction().getFunction()->arg_begin();
1178   for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
1179     EVT VT = Ins[i].VT;
1180     Type *ArgType = FuncArg->getType();
1181     unsigned ArgSizeInBits = ArgType->isPointerTy() ?
1182                              32 : ArgType->getPrimitiveSizeInBits();
1183     unsigned ArgBytes = ArgSizeInBits >> 3;
1184     EVT ArgVT;
1185     if (ArgSizeInBits < VT.getSizeInBits()) {
1186       assert(!ArgType->isFloatTy() &&
1187              "Extending floating point arguments not supported yet");
1188       ArgVT = MVT::getIntegerVT(ArgSizeInBits);
1189     } else {
1190       ArgVT = VT;
1191     }
1192     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1193                                                     AMDGPUAS::PARAM_I_ADDRESS);
1194     SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
1195                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
1196                                        MachinePointerInfo(UndefValue::get(PtrTy)),
1197                                        ArgVT, false, false, ArgBytes);
1198     InVals.push_back(Arg);
1199     ParamOffsetBytes += ArgBytes;
1200   }
1201   return Chain;
1202 }
1203
1204 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1205    if (!VT.isVector()) return MVT::i32;
1206    return VT.changeVectorElementTypeToInteger();
1207 }
1208
1209 //===----------------------------------------------------------------------===//
1210 // Custom DAG Optimizations
1211 //===----------------------------------------------------------------------===//
1212
1213 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1214                                               DAGCombinerInfo &DCI) const {
1215   SelectionDAG &DAG = DCI.DAG;
1216
1217   switch (N->getOpcode()) {
1218   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1219   case ISD::FP_ROUND: {
1220       SDValue Arg = N->getOperand(0);
1221       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1222         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1223                            Arg.getOperand(0));
1224       }
1225       break;
1226     }
1227
1228   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1229   // (i32 select_cc f32, f32, -1, 0 cc)
1230   //
1231   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1232   // this to one of the SET*_DX10 instructions.
1233   case ISD::FP_TO_SINT: {
1234     SDValue FNeg = N->getOperand(0);
1235     if (FNeg.getOpcode() != ISD::FNEG) {
1236       return SDValue();
1237     }
1238     SDValue SelectCC = FNeg.getOperand(0);
1239     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1240         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1241         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1242         !isHWTrueValue(SelectCC.getOperand(2)) ||
1243         !isHWFalseValue(SelectCC.getOperand(3))) {
1244       return SDValue();
1245     }
1246
1247     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1248                            SelectCC.getOperand(0), // LHS
1249                            SelectCC.getOperand(1), // RHS
1250                            DAG.getConstant(-1, MVT::i32), // True
1251                            DAG.getConstant(0, MVT::i32),  // Flase
1252                            SelectCC.getOperand(4)); // CC
1253
1254     break;
1255   }
1256   // Extract_vec (Build_vector) generated by custom lowering
1257   // also needs to be customly combined
1258   case ISD::EXTRACT_VECTOR_ELT: {
1259     SDValue Arg = N->getOperand(0);
1260     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1261       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1262         unsigned Element = Const->getZExtValue();
1263         return Arg->getOperand(Element);
1264       }
1265     }
1266     if (Arg.getOpcode() == ISD::BITCAST &&
1267         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1268       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1269         unsigned Element = Const->getZExtValue();
1270         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1271             Arg->getOperand(0).getOperand(Element));
1272       }
1273     }
1274   }
1275
1276   case ISD::SELECT_CC: {
1277     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1278     //      selectcc x, y, a, b, inv(cc)
1279     //
1280     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1281     //      selectcc x, y, a, b, cc
1282     SDValue LHS = N->getOperand(0);
1283     if (LHS.getOpcode() != ISD::SELECT_CC) {
1284       return SDValue();
1285     }
1286
1287     SDValue RHS = N->getOperand(1);
1288     SDValue True = N->getOperand(2);
1289     SDValue False = N->getOperand(3);
1290     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1291
1292     if (LHS.getOperand(2).getNode() != True.getNode() ||
1293         LHS.getOperand(3).getNode() != False.getNode() ||
1294         RHS.getNode() != False.getNode()) {
1295       return SDValue();
1296     }
1297
1298     switch (NCC) {
1299     default: return SDValue();
1300     case ISD::SETNE: return LHS;
1301     case ISD::SETEQ: {
1302       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1303       LHSCC = ISD::getSetCCInverse(LHSCC,
1304                                   LHS.getOperand(0).getValueType().isInteger());
1305       return DAG.getSelectCC(SDLoc(N),
1306                              LHS.getOperand(0),
1307                              LHS.getOperand(1),
1308                              LHS.getOperand(2),
1309                              LHS.getOperand(3),
1310                              LHSCC);
1311     }
1312     }
1313   }
1314   case AMDGPUISD::EXPORT: {
1315     SDValue Arg = N->getOperand(1);
1316     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1317       break;
1318     SDValue NewBldVec[4] = {
1319         DAG.getUNDEF(MVT::f32),
1320         DAG.getUNDEF(MVT::f32),
1321         DAG.getUNDEF(MVT::f32),
1322         DAG.getUNDEF(MVT::f32)
1323       };
1324     SDValue NewArgs[8] = {
1325       N->getOperand(0), // Chain
1326       SDValue(),
1327       N->getOperand(2), // ArrayBase
1328       N->getOperand(3), // Type
1329       N->getOperand(4), // SWZ_X
1330       N->getOperand(5), // SWZ_Y
1331       N->getOperand(6), // SWZ_Z
1332       N->getOperand(7) // SWZ_W
1333     };
1334     for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
1335       if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
1336         if (C->isZero()) {
1337           NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
1338         } else if (C->isExactlyValue(1.0)) {
1339           NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0
1340         } else {
1341           NewBldVec[i] = Arg.getOperand(i);
1342         }
1343       } else {
1344         NewBldVec[i] = Arg.getOperand(i);
1345       }
1346     }
1347     SDLoc DL(N);
1348     NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
1349     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1350   }
1351   }
1352   return SDValue();
1353 }