lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/MachineFrameInfo.h"
  20 #include "llvm/CodeGen/MachineInstrBuilder.h"
  21 #include "llvm/CodeGen/MachineRegisterInfo.h"
  22 #include "llvm/CodeGen/SelectionDAG.h"
  23 #include "llvm/IR/Argument.h"
  24 #include "llvm/IR/Function.h"
  25
  26 using namespace llvm;
  27
  28 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  29     AMDGPUTargetLowering(TM),
  30     TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  31   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  32   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  33   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  34   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  35   computeRegisterProperties();
  36
  37   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  38   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  39   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  40   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  41
  42   setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
  43   setOperationAction(ISD::AND,  MVT::v4i32, Expand);
  44   setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  45   setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  46   setOperationAction(ISD::MUL,  MVT::v2i32, Expand);
  47   setOperationAction(ISD::MUL,  MVT::v4i32, Expand);
  48   setOperationAction(ISD::OR, MVT::v4i32, Expand);
  49   setOperationAction(ISD::OR, MVT::v2i32, Expand);
  50   setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  51   setOperationAction(ISD::SHL, MVT::v4i32, Expand);
  52   setOperationAction(ISD::SHL, MVT::v2i32, Expand);
  53   setOperationAction(ISD::SRL, MVT::v4i32, Expand);
  54   setOperationAction(ISD::SRL, MVT::v2i32, Expand);
  55   setOperationAction(ISD::SRA, MVT::v4i32, Expand);
  56   setOperationAction(ISD::SRA, MVT::v2i32, Expand);
  57   setOperationAction(ISD::SUB, MVT::v4i32, Expand);
  58   setOperationAction(ISD::SUB, MVT::v2i32, Expand);
  59   setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  60   setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  61   setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  62   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  63   setOperationAction(ISD::XOR, MVT::v4i32, Expand);
  64   setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  65
  66   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  67   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  68
  69   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  70
  71   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  72   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  73   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  74
  75   setOperationAction(ISD::ROTL, MVT::i32, Custom);
  76
  77   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  78   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  79
  80   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  81   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  82   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  83
  84   setOperationAction(ISD::SELECT, MVT::i32, Custom);
  85   setOperationAction(ISD::SELECT, MVT::f32, Custom);
  86
  87   setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
  88   setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
  89
  90   // Legalize loads and stores to the private address space.
  91   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  92   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  93   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  94   setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  95   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  96   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  97   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  98   setOperationAction(ISD::STORE, MVT::i8, Custom);
  99   setOperationAction(ISD::STORE, MVT::i32, Custom);
 100   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 101   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 102
 103   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 104   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 105   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 106
 107   setTargetDAGCombine(ISD::FP_ROUND);
 108   setTargetDAGCombine(ISD::FP_TO_SINT);
 109   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 110   setTargetDAGCombine(ISD::SELECT_CC);
 111
 112   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 113   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 114   setSchedulingPreference(Sched::VLIW);
 115 }
 116
 117 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 118     MachineInstr * MI, MachineBasicBlock * BB) const {
 119   MachineFunction * MF = BB->getParent();
 120   MachineRegisterInfo &MRI = MF->getRegInfo();
 121   MachineBasicBlock::iterator I = *MI;
 122
 123   switch (MI->getOpcode()) {
 124   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 125   case AMDGPU::CLAMP_R600: {
 126     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 127                                                    AMDGPU::MOV,
 128                                                    MI->getOperand(0).getReg(),
 129                                                    MI->getOperand(1).getReg());
 130     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 131     break;
 132   }
 133
 134   case AMDGPU::FABS_R600: {
 135     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 136                                                     AMDGPU::MOV,
 137                                                     MI->getOperand(0).getReg(),
 138                                                     MI->getOperand(1).getReg());
 139     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 140     break;
 141   }
 142
 143   case AMDGPU::FNEG_R600: {
 144     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 145                                                     AMDGPU::MOV,
 146                                                     MI->getOperand(0).getReg(),
 147                                                     MI->getOperand(1).getReg());
 148     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 149     break;
 150   }
 151
 152   case AMDGPU::MASK_WRITE: {
 153     unsigned maskedRegister = MI->getOperand(0).getReg();
 154     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 155     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 156     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 157     break;
 158   }
 159
 160   case AMDGPU::MOV_IMM_F32:
 161     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 162                      MI->getOperand(1).getFPImm()->getValueAPF()
 163                          .bitcastToAPInt().getZExtValue());
 164     break;
 165   case AMDGPU::MOV_IMM_I32:
 166     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 167                      MI->getOperand(1).getImm());
 168     break;
 169   case AMDGPU::CONST_COPY: {
 170     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 171         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 172     TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
 173         MI->getOperand(1).getImm());
 174     break;
 175   }
 176
 177   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 178   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 179     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 180
 181     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 182             .addOperand(MI->getOperand(0))
 183             .addOperand(MI->getOperand(1))
 184             .addImm(EOP); // Set End of program bit
 185     break;
 186   }
 187
 188   case AMDGPU::TXD: {
 189     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 190     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 191     MachineOperand &RID = MI->getOperand(4);
 192     MachineOperand &SID = MI->getOperand(5);
 193     unsigned TextureId = MI->getOperand(6).getImm();
 194     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 195     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 196
 197     switch (TextureId) {
 198     case 5: // Rect
 199       CTX = CTY = 0;
 200       break;
 201     case 6: // Shadow1D
 202       SrcW = SrcZ;
 203       break;
 204     case 7: // Shadow2D
 205       SrcW = SrcZ;
 206       break;
 207     case 8: // ShadowRect
 208       CTX = CTY = 0;
 209       SrcW = SrcZ;
 210       break;
 211     case 9: // 1DArray
 212       SrcZ = SrcY;
 213       CTZ = 0;
 214       break;
 215     case 10: // 2DArray
 216       CTZ = 0;
 217       break;
 218     case 11: // Shadow1DArray
 219       SrcZ = SrcY;
 220       CTZ = 0;
 221       break;
 222     case 12: // Shadow2DArray
 223       CTZ = 0;
 224       break;
 225     }
 226     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 227             .addOperand(MI->getOperand(3))
 228             .addImm(SrcX)
 229             .addImm(SrcY)
 230             .addImm(SrcZ)
 231             .addImm(SrcW)
 232             .addImm(0)
 233             .addImm(0)
 234             .addImm(0)
 235             .addImm(0)
 236             .addImm(1)
 237             .addImm(2)
 238             .addImm(3)
 239             .addOperand(RID)
 240             .addOperand(SID)
 241             .addImm(CTX)
 242             .addImm(CTY)
 243             .addImm(CTZ)
 244             .addImm(CTW);
 245     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 246             .addOperand(MI->getOperand(2))
 247             .addImm(SrcX)
 248             .addImm(SrcY)
 249             .addImm(SrcZ)
 250             .addImm(SrcW)
 251             .addImm(0)
 252             .addImm(0)
 253             .addImm(0)
 254             .addImm(0)
 255             .addImm(1)
 256             .addImm(2)
 257             .addImm(3)
 258             .addOperand(RID)
 259             .addOperand(SID)
 260             .addImm(CTX)
 261             .addImm(CTY)
 262             .addImm(CTZ)
 263             .addImm(CTW);
 264     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 265             .addOperand(MI->getOperand(0))
 266             .addOperand(MI->getOperand(1))
 267             .addImm(SrcX)
 268             .addImm(SrcY)
 269             .addImm(SrcZ)
 270             .addImm(SrcW)
 271             .addImm(0)
 272             .addImm(0)
 273             .addImm(0)
 274             .addImm(0)
 275             .addImm(1)
 276             .addImm(2)
 277             .addImm(3)
 278             .addOperand(RID)
 279             .addOperand(SID)
 280             .addImm(CTX)
 281             .addImm(CTY)
 282             .addImm(CTZ)
 283             .addImm(CTW)
 284             .addReg(T0, RegState::Implicit)
 285             .addReg(T1, RegState::Implicit);
 286     break;
 287   }
 288
 289   case AMDGPU::TXD_SHADOW: {
 290     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 291     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 292     MachineOperand &RID = MI->getOperand(4);
 293     MachineOperand &SID = MI->getOperand(5);
 294     unsigned TextureId = MI->getOperand(6).getImm();
 295     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 296     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 297
 298     switch (TextureId) {
 299     case 5: // Rect
 300       CTX = CTY = 0;
 301       break;
 302     case 6: // Shadow1D
 303       SrcW = SrcZ;
 304       break;
 305     case 7: // Shadow2D
 306       SrcW = SrcZ;
 307       break;
 308     case 8: // ShadowRect
 309       CTX = CTY = 0;
 310       SrcW = SrcZ;
 311       break;
 312     case 9: // 1DArray
 313       SrcZ = SrcY;
 314       CTZ = 0;
 315       break;
 316     case 10: // 2DArray
 317       CTZ = 0;
 318       break;
 319     case 11: // Shadow1DArray
 320       SrcZ = SrcY;
 321       CTZ = 0;
 322       break;
 323     case 12: // Shadow2DArray
 324       CTZ = 0;
 325       break;
 326     }
 327
 328     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 329             .addOperand(MI->getOperand(3))
 330             .addImm(SrcX)
 331             .addImm(SrcY)
 332             .addImm(SrcZ)
 333             .addImm(SrcW)
 334             .addImm(0)
 335             .addImm(0)
 336             .addImm(0)
 337             .addImm(0)
 338             .addImm(1)
 339             .addImm(2)
 340             .addImm(3)
 341             .addOperand(RID)
 342             .addOperand(SID)
 343             .addImm(CTX)
 344             .addImm(CTY)
 345             .addImm(CTZ)
 346             .addImm(CTW);
 347     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 348             .addOperand(MI->getOperand(2))
 349             .addImm(SrcX)
 350             .addImm(SrcY)
 351             .addImm(SrcZ)
 352             .addImm(SrcW)
 353             .addImm(0)
 354             .addImm(0)
 355             .addImm(0)
 356             .addImm(0)
 357             .addImm(1)
 358             .addImm(2)
 359             .addImm(3)
 360             .addOperand(RID)
 361             .addOperand(SID)
 362             .addImm(CTX)
 363             .addImm(CTY)
 364             .addImm(CTZ)
 365             .addImm(CTW);
 366     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 367             .addOperand(MI->getOperand(0))
 368             .addOperand(MI->getOperand(1))
 369             .addImm(SrcX)
 370             .addImm(SrcY)
 371             .addImm(SrcZ)
 372             .addImm(SrcW)
 373             .addImm(0)
 374             .addImm(0)
 375             .addImm(0)
 376             .addImm(0)
 377             .addImm(1)
 378             .addImm(2)
 379             .addImm(3)
 380             .addOperand(RID)
 381             .addOperand(SID)
 382             .addImm(CTX)
 383             .addImm(CTY)
 384             .addImm(CTZ)
 385             .addImm(CTW)
 386             .addReg(T0, RegState::Implicit)
 387             .addReg(T1, RegState::Implicit);
 388     break;
 389   }
 390
 391   case AMDGPU::BRANCH:
 392       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 393               .addOperand(MI->getOperand(0));
 394       break;
 395
 396   case AMDGPU::BRANCH_COND_f32: {
 397     MachineInstr *NewMI =
 398       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 399               AMDGPU::PREDICATE_BIT)
 400               .addOperand(MI->getOperand(1))
 401               .addImm(OPCODE_IS_NOT_ZERO)
 402               .addImm(0); // Flags
 403     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 404     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 405             .addOperand(MI->getOperand(0))
 406             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 407     break;
 408   }
 409
 410   case AMDGPU::BRANCH_COND_i32: {
 411     MachineInstr *NewMI =
 412       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 413             AMDGPU::PREDICATE_BIT)
 414             .addOperand(MI->getOperand(1))
 415             .addImm(OPCODE_IS_NOT_ZERO_INT)
 416             .addImm(0); // Flags
 417     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 418     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 419            .addOperand(MI->getOperand(0))
 420             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 421     break;
 422   }
 423
 424   case AMDGPU::EG_ExportSwz:
 425   case AMDGPU::R600_ExportSwz: {
 426     // Instruction is left unmodified if its not the last one of its type
 427     bool isLastInstructionOfItsType = true;
 428     unsigned InstExportType = MI->getOperand(1).getImm();
 429     for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
 430          EndBlock = BB->end(); NextExportInst != EndBlock;
 431          NextExportInst = llvm::next(NextExportInst)) {
 432       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 433           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 434         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 435             .getImm();
 436         if (CurrentInstExportType == InstExportType) {
 437           isLastInstructionOfItsType = false;
 438           break;
 439         }
 440       }
 441     }
 442     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 443     if (!EOP && !isLastInstructionOfItsType)
 444       return BB;
 445     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 446     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 447             .addOperand(MI->getOperand(0))
 448             .addOperand(MI->getOperand(1))
 449             .addOperand(MI->getOperand(2))
 450             .addOperand(MI->getOperand(3))
 451             .addOperand(MI->getOperand(4))
 452             .addOperand(MI->getOperand(5))
 453             .addOperand(MI->getOperand(6))
 454             .addImm(CfInst)
 455             .addImm(EOP);
 456     break;
 457   }
 458   case AMDGPU::RETURN: {
 459     // RETURN instructions must have the live-out registers as implicit uses,
 460     // otherwise they appear dead.
 461     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 462     MachineInstrBuilder MIB(*MF, MI);
 463     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 464       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 465     return BB;
 466   }
 467   }
 468
 469   MI->eraseFromParent();
 470   return BB;
 471 }
 472
 473 //===----------------------------------------------------------------------===//
 474 // Custom DAG Lowering Operations
 475 //===----------------------------------------------------------------------===//
 476
 477 using namespace llvm::Intrinsic;
 478 using namespace llvm::AMDGPUIntrinsic;
 479
 480 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 481   switch (Op.getOpcode()) {
 482   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 483   case ISD::ROTL: return LowerROTL(Op, DAG);
 484   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 485   case ISD::SELECT: return LowerSELECT(Op, DAG);
 486   case ISD::STORE: return LowerSTORE(Op, DAG);
 487   case ISD::LOAD: return LowerLOAD(Op, DAG);
 488   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
 489   case ISD::INTRINSIC_VOID: {
 490     SDValue Chain = Op.getOperand(0);
 491     unsigned IntrinsicID =
 492                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 493     switch (IntrinsicID) {
 494     case AMDGPUIntrinsic::AMDGPU_store_output: {
 495       MachineFunction &MF = DAG.getMachineFunction();
 496       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 497       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 498       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 499       MFI->LiveOuts.push_back(Reg);
 500       return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
 501     }
 502     case AMDGPUIntrinsic::R600_store_swizzle: {
 503       const SDValue Args[8] = {
 504         Chain,
 505         Op.getOperand(2), // Export Value
 506         Op.getOperand(3), // ArrayBase
 507         Op.getOperand(4), // Type
 508         DAG.getConstant(0, MVT::i32), // SWZ_X
 509         DAG.getConstant(1, MVT::i32), // SWZ_Y
 510         DAG.getConstant(2, MVT::i32), // SWZ_Z
 511         DAG.getConstant(3, MVT::i32) // SWZ_W
 512       };
 513       return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(),
 514           Args, 8);
 515     }
 516
 517     // default for switch(IntrinsicID)
 518     default: break;
 519     }
 520     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 521     break;
 522   }
 523   case ISD::INTRINSIC_WO_CHAIN: {
 524     unsigned IntrinsicID =
 525                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 526     EVT VT = Op.getValueType();
 527     DebugLoc DL = Op.getDebugLoc();
 528     switch(IntrinsicID) {
 529     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 530     case AMDGPUIntrinsic::R600_load_input: {
 531       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 532       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 533       MachineFunction &MF = DAG.getMachineFunction();
 534       MachineRegisterInfo &MRI = MF.getRegInfo();
 535       MRI.addLiveIn(Reg);
 536       return DAG.getCopyFromReg(DAG.getEntryNode(),
 537           DAG.getEntryNode().getDebugLoc(), Reg, VT);
 538     }
 539
 540     case AMDGPUIntrinsic::R600_interp_input: {
 541       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 542       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 543       MachineSDNode *interp;
 544       if (ijb < 0) {
 545         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 546             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 547         return DAG.getTargetExtractSubreg(
 548             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 549             DL, MVT::f32, SDValue(interp, 0));
 550       }
 551
 552       if (slot % 4 < 2)
 553         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 554             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 555             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 556                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
 557             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 558                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
 559       else
 560         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 561             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 562             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 563                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
 564             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 565                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
 566
 567       return SDValue(interp, slot % 2);
 568     }
 569     case AMDGPUIntrinsic::R600_tex:
 570     case AMDGPUIntrinsic::R600_texc:
 571     case AMDGPUIntrinsic::R600_txl:
 572     case AMDGPUIntrinsic::R600_txlc:
 573     case AMDGPUIntrinsic::R600_txb:
 574     case AMDGPUIntrinsic::R600_txbc:
 575     case AMDGPUIntrinsic::R600_txf:
 576     case AMDGPUIntrinsic::R600_txq:
 577     case AMDGPUIntrinsic::R600_ddx:
 578     case AMDGPUIntrinsic::R600_ddy: {
 579       unsigned TextureOp;
 580       switch (IntrinsicID) {
 581       case AMDGPUIntrinsic::R600_tex:
 582         TextureOp = 0;
 583         break;
 584       case AMDGPUIntrinsic::R600_texc:
 585         TextureOp = 1;
 586         break;
 587       case AMDGPUIntrinsic::R600_txl:
 588         TextureOp = 2;
 589         break;
 590       case AMDGPUIntrinsic::R600_txlc:
 591         TextureOp = 3;
 592         break;
 593       case AMDGPUIntrinsic::R600_txb:
 594         TextureOp = 4;
 595         break;
 596       case AMDGPUIntrinsic::R600_txbc:
 597         TextureOp = 5;
 598         break;
 599       case AMDGPUIntrinsic::R600_txf:
 600         TextureOp = 6;
 601         break;
 602       case AMDGPUIntrinsic::R600_txq:
 603         TextureOp = 7;
 604         break;
 605       case AMDGPUIntrinsic::R600_ddx:
 606         TextureOp = 8;
 607         break;
 608       case AMDGPUIntrinsic::R600_ddy:
 609         TextureOp = 9;
 610         break;
 611       default:
 612         llvm_unreachable("Unknow Texture Operation");
 613       }
 614
 615       SDValue TexArgs[19] = {
 616         DAG.getConstant(TextureOp, MVT::i32),
 617         Op.getOperand(1),
 618         DAG.getConstant(0, MVT::i32),
 619         DAG.getConstant(1, MVT::i32),
 620         DAG.getConstant(2, MVT::i32),
 621         DAG.getConstant(3, MVT::i32),
 622         Op.getOperand(2),
 623         Op.getOperand(3),
 624         Op.getOperand(4),
 625         DAG.getConstant(0, MVT::i32),
 626         DAG.getConstant(1, MVT::i32),
 627         DAG.getConstant(2, MVT::i32),
 628         DAG.getConstant(3, MVT::i32),
 629         Op.getOperand(5),
 630         Op.getOperand(6),
 631         Op.getOperand(7),
 632         Op.getOperand(8),
 633         Op.getOperand(9),
 634         Op.getOperand(10)
 635       };
 636       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
 637     }
 638     case AMDGPUIntrinsic::AMDGPU_dp4: {
 639       SDValue Args[8] = {
 640       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 641           DAG.getConstant(0, MVT::i32)),
 642       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 643           DAG.getConstant(0, MVT::i32)),
 644       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 645           DAG.getConstant(1, MVT::i32)),
 646       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 647           DAG.getConstant(1, MVT::i32)),
 648       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 649           DAG.getConstant(2, MVT::i32)),
 650       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 651           DAG.getConstant(2, MVT::i32)),
 652       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 653           DAG.getConstant(3, MVT::i32)),
 654       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 655           DAG.getConstant(3, MVT::i32))
 656       };
 657       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
 658     }
 659
 660     case r600_read_ngroups_x:
 661       return LowerImplicitParameter(DAG, VT, DL, 0);
 662     case r600_read_ngroups_y:
 663       return LowerImplicitParameter(DAG, VT, DL, 1);
 664     case r600_read_ngroups_z:
 665       return LowerImplicitParameter(DAG, VT, DL, 2);
 666     case r600_read_global_size_x:
 667       return LowerImplicitParameter(DAG, VT, DL, 3);
 668     case r600_read_global_size_y:
 669       return LowerImplicitParameter(DAG, VT, DL, 4);
 670     case r600_read_global_size_z:
 671       return LowerImplicitParameter(DAG, VT, DL, 5);
 672     case r600_read_local_size_x:
 673       return LowerImplicitParameter(DAG, VT, DL, 6);
 674     case r600_read_local_size_y:
 675       return LowerImplicitParameter(DAG, VT, DL, 7);
 676     case r600_read_local_size_z:
 677       return LowerImplicitParameter(DAG, VT, DL, 8);
 678
 679     case r600_read_tgid_x:
 680       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 681                                   AMDGPU::T1_X, VT);
 682     case r600_read_tgid_y:
 683       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 684                                   AMDGPU::T1_Y, VT);
 685     case r600_read_tgid_z:
 686       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 687                                   AMDGPU::T1_Z, VT);
 688     case r600_read_tidig_x:
 689       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 690                                   AMDGPU::T0_X, VT);
 691     case r600_read_tidig_y:
 692       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 693                                   AMDGPU::T0_Y, VT);
 694     case r600_read_tidig_z:
 695       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 696                                   AMDGPU::T0_Z, VT);
 697     }
 698     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 699     break;
 700   }
 701   } // end switch(Op.getOpcode())
 702   return SDValue();
 703 }
 704
 705 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 706                                             SmallVectorImpl<SDValue> &Results,
 707                                             SelectionDAG &DAG) const {
 708   switch (N->getOpcode()) {
 709   default: return;
 710   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 711     return;
 712   case ISD::LOAD: {
 713     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 714     Results.push_back(SDValue(Node, 0));
 715     Results.push_back(SDValue(Node, 1));
 716     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 717     // function
 718     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 719     return;
 720   }
 721   case ISD::STORE:
 722     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 723     Results.push_back(SDValue(Node, 0));
 724     return;
 725   }
 726 }
 727
 728 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 729   return DAG.getNode(
 730       ISD::SETCC,
 731       Op.getDebugLoc(),
 732       MVT::i1,
 733       Op, DAG.getConstantFP(0.0f, MVT::f32),
 734       DAG.getCondCode(ISD::SETNE)
 735       );
 736 }
 737
 738 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 739                                                    DebugLoc DL,
 740                                                    unsigned DwordOffset) const {
 741   unsigned ByteOffset = DwordOffset * 4;
 742   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 743                                       AMDGPUAS::PARAM_I_ADDRESS);
 744
 745   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 746   assert(isInt<16>(ByteOffset));
 747
 748   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 749                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 750                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 751                      false, false, false, 0);
 752 }
 753
 754 SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
 755
 756   MachineFunction &MF = DAG.getMachineFunction();
 757   const AMDGPUFrameLowering *TFL =
 758    static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
 759
 760   FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
 761   assert(FIN);
 762
 763   unsigned FrameIndex = FIN->getIndex();
 764   unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
 765   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
 766 }
 767
 768 SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
 769   DebugLoc DL = Op.getDebugLoc();
 770   EVT VT = Op.getValueType();
 771
 772   return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
 773                      Op.getOperand(0),
 774                      Op.getOperand(0),
 775                      DAG.getNode(ISD::SUB, DL, VT,
 776                                  DAG.getConstant(32, MVT::i32),
 777                                  Op.getOperand(1)));
 778 }
 779
 780 bool R600TargetLowering::isZero(SDValue Op) const {
 781   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 782     return Cst->isNullValue();
 783   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 784     return CstFP->isZero();
 785   } else {
 786     return false;
 787   }
 788 }
 789
 790 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 791   DebugLoc DL = Op.getDebugLoc();
 792   EVT VT = Op.getValueType();
 793
 794   SDValue LHS = Op.getOperand(0);
 795   SDValue RHS = Op.getOperand(1);
 796   SDValue True = Op.getOperand(2);
 797   SDValue False = Op.getOperand(3);
 798   SDValue CC = Op.getOperand(4);
 799   SDValue Temp;
 800
 801   // LHS and RHS are guaranteed to be the same value type
 802   EVT CompareVT = LHS.getValueType();
 803
 804   // Check if we can lower this to a native operation.
 805
 806   // Try to lower to a SET* instruction:
 807   //
 808   // SET* can match the following patterns:
 809   //
 810   // select_cc f32, f32, -1,  0, cc_any
 811   // select_cc f32, f32, 1.0f, 0.0f, cc_any
 812   // select_cc i32, i32, -1,  0, cc_any
 813   //
 814
 815   // Move hardware True/False values to the correct operand.
 816   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 817     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 818     std::swap(False, True);
 819     CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
 820   }
 821
 822   if (isHWTrueValue(True) && isHWFalseValue(False) &&
 823       (CompareVT == VT || VT == MVT::i32)) {
 824     // This can be matched by a SET* instruction.
 825     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 826   }
 827
 828   // Try to lower to a CND* instruction:
 829   //
 830   // CND* can match the following patterns:
 831   //
 832   // select_cc f32, 0.0, f32, f32, cc_any
 833   // select_cc f32, 0.0, i32, i32, cc_any
 834   // select_cc i32, 0,   f32, f32, cc_any
 835   // select_cc i32, 0,   i32, i32, cc_any
 836   //
 837   if (isZero(LHS) || isZero(RHS)) {
 838     SDValue Cond = (isZero(LHS) ? RHS : LHS);
 839     SDValue Zero = (isZero(LHS) ? LHS : RHS);
 840     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 841     if (CompareVT != VT) {
 842       // Bitcast True / False to the correct types.  This will end up being
 843       // a nop, but it allows us to define only a single pattern in the
 844       // .TD files for each CND* instruction rather than having to have
 845       // one pattern for integer True/False and one for fp True/False
 846       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 847       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 848     }
 849     if (isZero(LHS)) {
 850       CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
 851     }
 852
 853     switch (CCOpcode) {
 854     case ISD::SETONE:
 855     case ISD::SETUNE:
 856     case ISD::SETNE:
 857     case ISD::SETULE:
 858     case ISD::SETULT:
 859     case ISD::SETOLE:
 860     case ISD::SETOLT:
 861     case ISD::SETLE:
 862     case ISD::SETLT:
 863       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 864       Temp = True;
 865       True = False;
 866       False = Temp;
 867       break;
 868     default:
 869       break;
 870     }
 871     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 872         Cond, Zero,
 873         True, False,
 874         DAG.getCondCode(CCOpcode));
 875     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 876   }
 877
 878
 879   // Possible Min/Max pattern
 880   SDValue MinMax = LowerMinMax(Op, DAG);
 881   if (MinMax.getNode()) {
 882     return MinMax;
 883   }
 884
 885   // If we make it this for it means we have no native instructions to handle
 886   // this SELECT_CC, so we must lower it.
 887   SDValue HWTrue, HWFalse;
 888
 889   if (CompareVT == MVT::f32) {
 890     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 891     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 892   } else if (CompareVT == MVT::i32) {
 893     HWTrue = DAG.getConstant(-1, CompareVT);
 894     HWFalse = DAG.getConstant(0, CompareVT);
 895   }
 896   else {
 897     assert(!"Unhandled value type in LowerSELECT_CC");
 898   }
 899
 900   // Lower this unsupported SELECT_CC into a combination of two supported
 901   // SELECT_CC operations.
 902   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 903
 904   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 905       Cond, HWFalse,
 906       True, False,
 907       DAG.getCondCode(ISD::SETNE));
 908 }
 909
 910 SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 911   return DAG.getNode(ISD::SELECT_CC,
 912       Op.getDebugLoc(),
 913       Op.getValueType(),
 914       Op.getOperand(0),
 915       DAG.getConstant(0, MVT::i32),
 916       Op.getOperand(1),
 917       Op.getOperand(2),
 918       DAG.getCondCode(ISD::SETNE));
 919 }
 920
 921 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 922 /// convert these pointers to a register index.  Each register holds
 923 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
 924 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
 925 /// for indirect addressing.
 926 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
 927                                                unsigned StackWidth,
 928                                                SelectionDAG &DAG) const {
 929   unsigned SRLPad;
 930   switch(StackWidth) {
 931   case 1:
 932     SRLPad = 2;
 933     break;
 934   case 2:
 935     SRLPad = 3;
 936     break;
 937   case 4:
 938     SRLPad = 4;
 939     break;
 940   default: llvm_unreachable("Invalid stack width");
 941   }
 942
 943   return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
 944                      DAG.getConstant(SRLPad, MVT::i32));
 945 }
 946
 947 void R600TargetLowering::getStackAddress(unsigned StackWidth,
 948                                          unsigned ElemIdx,
 949                                          unsigned &Channel,
 950                                          unsigned &PtrIncr) const {
 951   switch (StackWidth) {
 952   default:
 953   case 1:
 954     Channel = 0;
 955     if (ElemIdx > 0) {
 956       PtrIncr = 1;
 957     } else {
 958       PtrIncr = 0;
 959     }
 960     break;
 961   case 2:
 962     Channel = ElemIdx % 2;
 963     if (ElemIdx == 2) {
 964       PtrIncr = 1;
 965     } else {
 966       PtrIncr = 0;
 967     }
 968     break;
 969   case 4:
 970     Channel = ElemIdx;
 971     PtrIncr = 0;
 972     break;
 973   }
 974 }
 975
 976 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 977   DebugLoc DL = Op.getDebugLoc();
 978   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
 979   SDValue Chain = Op.getOperand(0);
 980   SDValue Value = Op.getOperand(1);
 981   SDValue Ptr = Op.getOperand(2);
 982
 983   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
 984       Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
 985     // Convert pointer from byte address to dword address.
 986     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
 987                       DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
 988                                   Ptr, DAG.getConstant(2, MVT::i32)));
 989
 990     if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
 991       assert(!"Truncated and indexed stores not supported yet");
 992     } else {
 993       Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
 994     }
 995     return Chain;
 996   }
 997
 998   EVT ValueVT = Value.getValueType();
 999
1000   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1001     return SDValue();
1002   }
1003
1004   // Lowering for indirect addressing
1005
1006   const MachineFunction &MF = DAG.getMachineFunction();
1007   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1008                                          getTargetMachine().getFrameLowering());
1009   unsigned StackWidth = TFL->getStackWidth(MF);
1010
1011   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1012
1013   if (ValueVT.isVector()) {
1014     unsigned NumElemVT = ValueVT.getVectorNumElements();
1015     EVT ElemVT = ValueVT.getVectorElementType();
1016     SDValue Stores[4];
1017
1018     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1019                                       "vector width in load");
1020
1021     for (unsigned i = 0; i < NumElemVT; ++i) {
1022       unsigned Channel, PtrIncr;
1023       getStackAddress(StackWidth, i, Channel, PtrIncr);
1024       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1025                         DAG.getConstant(PtrIncr, MVT::i32));
1026       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1027                                  Value, DAG.getConstant(i, MVT::i32));
1028
1029       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1030                               Chain, Elem, Ptr,
1031                               DAG.getTargetConstant(Channel, MVT::i32));
1032     }
1033      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1034    } else {
1035     if (ValueVT == MVT::i8) {
1036       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1037     }
1038     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1039     DAG.getTargetConstant(0, MVT::i32)); // Channel
1040   }
1041
1042   return Chain;
1043 }
1044
1045 // return (512 + (kc_bank << 12)
1046 static int
1047 ConstantAddressBlock(unsigned AddressSpace) {
1048   switch (AddressSpace) {
1049   case AMDGPUAS::CONSTANT_BUFFER_0:
1050     return 512;
1051   case AMDGPUAS::CONSTANT_BUFFER_1:
1052     return 512 + 4096;
1053   case AMDGPUAS::CONSTANT_BUFFER_2:
1054     return 512 + 4096 * 2;
1055   case AMDGPUAS::CONSTANT_BUFFER_3:
1056     return 512 + 4096 * 3;
1057   case AMDGPUAS::CONSTANT_BUFFER_4:
1058     return 512 + 4096 * 4;
1059   case AMDGPUAS::CONSTANT_BUFFER_5:
1060     return 512 + 4096 * 5;
1061   case AMDGPUAS::CONSTANT_BUFFER_6:
1062     return 512 + 4096 * 6;
1063   case AMDGPUAS::CONSTANT_BUFFER_7:
1064     return 512 + 4096 * 7;
1065   case AMDGPUAS::CONSTANT_BUFFER_8:
1066     return 512 + 4096 * 8;
1067   case AMDGPUAS::CONSTANT_BUFFER_9:
1068     return 512 + 4096 * 9;
1069   case AMDGPUAS::CONSTANT_BUFFER_10:
1070     return 512 + 4096 * 10;
1071   case AMDGPUAS::CONSTANT_BUFFER_11:
1072     return 512 + 4096 * 11;
1073   case AMDGPUAS::CONSTANT_BUFFER_12:
1074     return 512 + 4096 * 12;
1075   case AMDGPUAS::CONSTANT_BUFFER_13:
1076     return 512 + 4096 * 13;
1077   case AMDGPUAS::CONSTANT_BUFFER_14:
1078     return 512 + 4096 * 14;
1079   case AMDGPUAS::CONSTANT_BUFFER_15:
1080     return 512 + 4096 * 15;
1081   default:
1082     return -1;
1083   }
1084 }
1085
1086 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1087 {
1088   EVT VT = Op.getValueType();
1089   DebugLoc DL = Op.getDebugLoc();
1090   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1091   SDValue Chain = Op.getOperand(0);
1092   SDValue Ptr = Op.getOperand(1);
1093   SDValue LoweredLoad;
1094
1095   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1096   if (ConstantBlock > -1) {
1097     SDValue Result;
1098     if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
1099         dyn_cast<Constant>(LoadNode->getSrcValue()) ||
1100         dyn_cast<ConstantSDNode>(Ptr)) {
1101       SDValue Slots[4];
1102       for (unsigned i = 0; i < 4; i++) {
1103         // We want Const position encoded with the following formula :
1104         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1105         // const_index is Ptr computed by llvm using an alignment of 16.
1106         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1107         // then div by 4 at the ISel step
1108         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1109             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1110         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1111       }
1112       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
1113     } else {
1114       // non constant ptr cant be folded, keeps it as a v4f32 load
1115       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1116           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1117           DAG.getConstant(LoadNode->getAddressSpace() -
1118                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1119           );
1120     }
1121
1122     if (!VT.isVector()) {
1123       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1124           DAG.getConstant(0, MVT::i32));
1125     }
1126
1127     SDValue MergedValues[2] = {
1128         Result,
1129         Chain
1130     };
1131     return DAG.getMergeValues(MergedValues, 2, DL);
1132   }
1133
1134   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1135     return SDValue();
1136   }
1137
1138   // Lowering for indirect addressing
1139   const MachineFunction &MF = DAG.getMachineFunction();
1140   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1141                                          getTargetMachine().getFrameLowering());
1142   unsigned StackWidth = TFL->getStackWidth(MF);
1143
1144   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1145
1146   if (VT.isVector()) {
1147     unsigned NumElemVT = VT.getVectorNumElements();
1148     EVT ElemVT = VT.getVectorElementType();
1149     SDValue Loads[4];
1150
1151     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1152                                       "vector width in load");
1153
1154     for (unsigned i = 0; i < NumElemVT; ++i) {
1155       unsigned Channel, PtrIncr;
1156       getStackAddress(StackWidth, i, Channel, PtrIncr);
1157       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1158                         DAG.getConstant(PtrIncr, MVT::i32));
1159       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1160                              Chain, Ptr,
1161                              DAG.getTargetConstant(Channel, MVT::i32),
1162                              Op.getOperand(2));
1163     }
1164     for (unsigned i = NumElemVT; i < 4; ++i) {
1165       Loads[i] = DAG.getUNDEF(ElemVT);
1166     }
1167     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1168     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1169   } else {
1170     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1171                               Chain, Ptr,
1172                               DAG.getTargetConstant(0, MVT::i32), // Channel
1173                               Op.getOperand(2));
1174   }
1175
1176   SDValue Ops[2];
1177   Ops[0] = LoweredLoad;
1178   Ops[1] = Chain;
1179
1180   return DAG.getMergeValues(Ops, 2, DL);
1181 }
1182
1183 /// XXX Only kernel functions are supported, so we can assume for now that
1184 /// every function is a kernel function, but in the future we should use
1185 /// separate calling conventions for kernel and non-kernel functions.
1186 SDValue R600TargetLowering::LowerFormalArguments(
1187                                       SDValue Chain,
1188                                       CallingConv::ID CallConv,
1189                                       bool isVarArg,
1190                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1191                                       DebugLoc DL, SelectionDAG &DAG,
1192                                       SmallVectorImpl<SDValue> &InVals) const {
1193   unsigned ParamOffsetBytes = 36;
1194   Function::const_arg_iterator FuncArg =
1195                             DAG.getMachineFunction().getFunction()->arg_begin();
1196   for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
1197     EVT VT = Ins[i].VT;
1198     Type *ArgType = FuncArg->getType();
1199     unsigned ArgSizeInBits = ArgType->isPointerTy() ?
1200                              32 : ArgType->getPrimitiveSizeInBits();
1201     unsigned ArgBytes = ArgSizeInBits >> 3;
1202     EVT ArgVT;
1203     if (ArgSizeInBits < VT.getSizeInBits()) {
1204       assert(!ArgType->isFloatTy() &&
1205              "Extending floating point arguments not supported yet");
1206       ArgVT = MVT::getIntegerVT(ArgSizeInBits);
1207     } else {
1208       ArgVT = VT;
1209     }
1210     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1211                                                     AMDGPUAS::PARAM_I_ADDRESS);
1212     SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
1213                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
1214                                        MachinePointerInfo(UndefValue::get(PtrTy)),
1215                                        ArgVT, false, false, ArgBytes);
1216     InVals.push_back(Arg);
1217     ParamOffsetBytes += ArgBytes;
1218   }
1219   return Chain;
1220 }
1221
1222 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1223    if (!VT.isVector()) return MVT::i32;
1224    return VT.changeVectorElementTypeToInteger();
1225 }
1226
1227 //===----------------------------------------------------------------------===//
1228 // Custom DAG Optimizations
1229 //===----------------------------------------------------------------------===//
1230
1231 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1232                                               DAGCombinerInfo &DCI) const {
1233   SelectionDAG &DAG = DCI.DAG;
1234
1235   switch (N->getOpcode()) {
1236   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1237   case ISD::FP_ROUND: {
1238       SDValue Arg = N->getOperand(0);
1239       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1240         return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
1241                            Arg.getOperand(0));
1242       }
1243       break;
1244     }
1245
1246   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1247   // (i32 select_cc f32, f32, -1, 0 cc)
1248   //
1249   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1250   // this to one of the SET*_DX10 instructions.
1251   case ISD::FP_TO_SINT: {
1252     SDValue FNeg = N->getOperand(0);
1253     if (FNeg.getOpcode() != ISD::FNEG) {
1254       return SDValue();
1255     }
1256     SDValue SelectCC = FNeg.getOperand(0);
1257     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1258         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1259         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1260         !isHWTrueValue(SelectCC.getOperand(2)) ||
1261         !isHWFalseValue(SelectCC.getOperand(3))) {
1262       return SDValue();
1263     }
1264
1265     return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
1266                            SelectCC.getOperand(0), // LHS
1267                            SelectCC.getOperand(1), // RHS
1268                            DAG.getConstant(-1, MVT::i32), // True
1269                            DAG.getConstant(0, MVT::i32),  // Flase
1270                            SelectCC.getOperand(4)); // CC
1271
1272     break;
1273   }
1274   // Extract_vec (Build_vector) generated by custom lowering
1275   // also needs to be customly combined
1276   case ISD::EXTRACT_VECTOR_ELT: {
1277     SDValue Arg = N->getOperand(0);
1278     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1279       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1280         unsigned Element = Const->getZExtValue();
1281         return Arg->getOperand(Element);
1282       }
1283     }
1284     if (Arg.getOpcode() == ISD::BITCAST &&
1285         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1286       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1287         unsigned Element = Const->getZExtValue();
1288         return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
1289             Arg->getOperand(0).getOperand(Element));
1290       }
1291     }
1292   }
1293
1294   case ISD::SELECT_CC: {
1295     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1296     //      selectcc x, y, a, b, inv(cc)
1297     //
1298     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1299     //      selectcc x, y, a, b, cc
1300     SDValue LHS = N->getOperand(0);
1301     if (LHS.getOpcode() != ISD::SELECT_CC) {
1302       return SDValue();
1303     }
1304
1305     SDValue RHS = N->getOperand(1);
1306     SDValue True = N->getOperand(2);
1307     SDValue False = N->getOperand(3);
1308     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1309
1310     if (LHS.getOperand(2).getNode() != True.getNode() ||
1311         LHS.getOperand(3).getNode() != False.getNode() ||
1312         RHS.getNode() != False.getNode()) {
1313       return SDValue();
1314     }
1315
1316     switch (NCC) {
1317     default: return SDValue();
1318     case ISD::SETNE: return LHS;
1319     case ISD::SETEQ: {
1320       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1321       LHSCC = ISD::getSetCCInverse(LHSCC,
1322                                   LHS.getOperand(0).getValueType().isInteger());
1323       return DAG.getSelectCC(N->getDebugLoc(),
1324                              LHS.getOperand(0),
1325                              LHS.getOperand(1),
1326                              LHS.getOperand(2),
1327                              LHS.getOperand(3),
1328                              LHSCC);
1329     }
1330     }
1331   }
1332   case AMDGPUISD::EXPORT: {
1333     SDValue Arg = N->getOperand(1);
1334     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1335       break;
1336     SDValue NewBldVec[4] = {
1337         DAG.getUNDEF(MVT::f32),
1338         DAG.getUNDEF(MVT::f32),
1339         DAG.getUNDEF(MVT::f32),
1340         DAG.getUNDEF(MVT::f32)
1341       };
1342     SDValue NewArgs[8] = {
1343       N->getOperand(0), // Chain
1344       SDValue(),
1345       N->getOperand(2), // ArrayBase
1346       N->getOperand(3), // Type
1347       N->getOperand(4), // SWZ_X
1348       N->getOperand(5), // SWZ_Y
1349       N->getOperand(6), // SWZ_Z
1350       N->getOperand(7) // SWZ_W
1351     };
1352     for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
1353       if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
1354         if (C->isZero()) {
1355           NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
1356         } else if (C->isExactlyValue(1.0)) {
1357           NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0
1358         } else {
1359           NewBldVec[i] = Arg.getOperand(i);
1360         }
1361       } else {
1362         NewBldVec[i] = Arg.getOperand(i);
1363       }
1364     }
1365     DebugLoc DL = N->getDebugLoc();
1366     NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
1367     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1368   }
1369   }
1370   return SDValue();
1371 }