lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/MachineFrameInfo.h"
  20 #include "llvm/CodeGen/MachineInstrBuilder.h"
  21 #include "llvm/CodeGen/MachineRegisterInfo.h"
  22 #include "llvm/CodeGen/SelectionDAG.h"
  23 #include "llvm/IR/Argument.h"
  24 #include "llvm/IR/Function.h"
  25
  26 using namespace llvm;
  27
  28 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  29     AMDGPUTargetLowering(TM),
  30     TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  31   setOperationAction(ISD::MUL, MVT::i64, Expand);
  32   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  33   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  34   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  35   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  36   computeRegisterProperties();
  37
  38   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  39   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  40   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  41   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  42
  43   setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
  44   setOperationAction(ISD::AND,  MVT::v4i32, Expand);
  45   setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  46   setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  47   setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  48   setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  49   setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  50   setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  51   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  52
  53   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  54   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  55
  56   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  57
  58   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  59   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  60   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  61   setOperationAction(ISD::FPOW, MVT::f32, Custom);
  62
  63   setOperationAction(ISD::ROTL, MVT::i32, Custom);
  64
  65   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  66   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  67
  68   setOperationAction(ISD::SETCC, MVT::i32, Custom);
  69   setOperationAction(ISD::SETCC, MVT::f32, Custom);
  70   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  71
  72   setOperationAction(ISD::SELECT, MVT::i32, Custom);
  73   setOperationAction(ISD::SELECT, MVT::f32, Custom);
  74
  75   // Legalize loads and stores to the private address space.
  76   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  77   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  78   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  79   setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  80   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  81   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  82   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  83   setOperationAction(ISD::STORE, MVT::i8, Custom);
  84   setOperationAction(ISD::STORE, MVT::i32, Custom);
  85   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  86   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  87
  88   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  89   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  90   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
  91
  92   setTargetDAGCombine(ISD::FP_ROUND);
  93   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  94
  95   setSchedulingPreference(Sched::VLIW);
  96 }
  97
  98 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
  99     MachineInstr * MI, MachineBasicBlock * BB) const {
 100   MachineFunction * MF = BB->getParent();
 101   MachineRegisterInfo &MRI = MF->getRegInfo();
 102   MachineBasicBlock::iterator I = *MI;
 103
 104   switch (MI->getOpcode()) {
 105   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 106   case AMDGPU::SHADER_TYPE: break;
 107   case AMDGPU::CLAMP_R600: {
 108     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 109                                                    AMDGPU::MOV,
 110                                                    MI->getOperand(0).getReg(),
 111                                                    MI->getOperand(1).getReg());
 112     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 113     break;
 114   }
 115
 116   case AMDGPU::FABS_R600: {
 117     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 118                                                     AMDGPU::MOV,
 119                                                     MI->getOperand(0).getReg(),
 120                                                     MI->getOperand(1).getReg());
 121     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 122     break;
 123   }
 124
 125   case AMDGPU::FNEG_R600: {
 126     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 127                                                     AMDGPU::MOV,
 128                                                     MI->getOperand(0).getReg(),
 129                                                     MI->getOperand(1).getReg());
 130     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 131     break;
 132   }
 133
 134   case AMDGPU::MASK_WRITE: {
 135     unsigned maskedRegister = MI->getOperand(0).getReg();
 136     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 137     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 138     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 139     break;
 140   }
 141
 142   case AMDGPU::MOV_IMM_F32:
 143     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 144                      MI->getOperand(1).getFPImm()->getValueAPF()
 145                          .bitcastToAPInt().getZExtValue());
 146     break;
 147   case AMDGPU::MOV_IMM_I32:
 148     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 149                      MI->getOperand(1).getImm());
 150     break;
 151
 152
 153   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 154   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 155     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 156
 157     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 158             .addOperand(MI->getOperand(0))
 159             .addOperand(MI->getOperand(1))
 160             .addImm(EOP); // Set End of program bit
 161     break;
 162   }
 163
 164   case AMDGPU::TXD: {
 165     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 166     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 167
 168     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 169             .addOperand(MI->getOperand(3))
 170             .addOperand(MI->getOperand(4))
 171             .addOperand(MI->getOperand(5))
 172             .addOperand(MI->getOperand(6));
 173     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 174             .addOperand(MI->getOperand(2))
 175             .addOperand(MI->getOperand(4))
 176             .addOperand(MI->getOperand(5))
 177             .addOperand(MI->getOperand(6));
 178     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 179             .addOperand(MI->getOperand(0))
 180             .addOperand(MI->getOperand(1))
 181             .addOperand(MI->getOperand(4))
 182             .addOperand(MI->getOperand(5))
 183             .addOperand(MI->getOperand(6))
 184             .addReg(T0, RegState::Implicit)
 185             .addReg(T1, RegState::Implicit);
 186     break;
 187   }
 188
 189   case AMDGPU::TXD_SHADOW: {
 190     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 191     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 192
 193     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 194             .addOperand(MI->getOperand(3))
 195             .addOperand(MI->getOperand(4))
 196             .addOperand(MI->getOperand(5))
 197             .addOperand(MI->getOperand(6));
 198     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 199             .addOperand(MI->getOperand(2))
 200             .addOperand(MI->getOperand(4))
 201             .addOperand(MI->getOperand(5))
 202             .addOperand(MI->getOperand(6));
 203     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 204             .addOperand(MI->getOperand(0))
 205             .addOperand(MI->getOperand(1))
 206             .addOperand(MI->getOperand(4))
 207             .addOperand(MI->getOperand(5))
 208             .addOperand(MI->getOperand(6))
 209             .addReg(T0, RegState::Implicit)
 210             .addReg(T1, RegState::Implicit);
 211     break;
 212   }
 213
 214   case AMDGPU::BRANCH:
 215       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 216               .addOperand(MI->getOperand(0))
 217               .addReg(0);
 218       break;
 219
 220   case AMDGPU::BRANCH_COND_f32: {
 221     MachineInstr *NewMI =
 222       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 223               AMDGPU::PREDICATE_BIT)
 224               .addOperand(MI->getOperand(1))
 225               .addImm(OPCODE_IS_NOT_ZERO)
 226               .addImm(0); // Flags
 227     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 228     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 229             .addOperand(MI->getOperand(0))
 230             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 231     break;
 232   }
 233
 234   case AMDGPU::BRANCH_COND_i32: {
 235     MachineInstr *NewMI =
 236       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 237             AMDGPU::PREDICATE_BIT)
 238             .addOperand(MI->getOperand(1))
 239             .addImm(OPCODE_IS_NOT_ZERO_INT)
 240             .addImm(0); // Flags
 241     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 242     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 243            .addOperand(MI->getOperand(0))
 244             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 245     break;
 246   }
 247
 248   case AMDGPU::EG_ExportSwz:
 249   case AMDGPU::R600_ExportSwz: {
 250     // Instruction is left unmodified if its not the last one of its type
 251     bool isLastInstructionOfItsType = true;
 252     unsigned InstExportType = MI->getOperand(1).getImm();
 253     for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
 254          EndBlock = BB->end(); NextExportInst != EndBlock;
 255          NextExportInst = llvm::next(NextExportInst)) {
 256       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 257           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 258         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 259             .getImm();
 260         if (CurrentInstExportType == InstExportType) {
 261           isLastInstructionOfItsType = false;
 262           break;
 263         }
 264       }
 265     }
 266     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 267     if (!EOP && !isLastInstructionOfItsType)
 268       return BB;
 269     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 270     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 271             .addOperand(MI->getOperand(0))
 272             .addOperand(MI->getOperand(1))
 273             .addOperand(MI->getOperand(2))
 274             .addOperand(MI->getOperand(3))
 275             .addOperand(MI->getOperand(4))
 276             .addOperand(MI->getOperand(5))
 277             .addOperand(MI->getOperand(6))
 278             .addImm(CfInst)
 279             .addImm(EOP);
 280     break;
 281   }
 282   case AMDGPU::RETURN: {
 283     // RETURN instructions must have the live-out registers as implicit uses,
 284     // otherwise they appear dead.
 285     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 286     MachineInstrBuilder MIB(*MF, MI);
 287     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 288       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 289     return BB;
 290   }
 291   }
 292
 293   MI->eraseFromParent();
 294   return BB;
 295 }
 296
 297 //===----------------------------------------------------------------------===//
 298 // Custom DAG Lowering Operations
 299 //===----------------------------------------------------------------------===//
 300
 301 using namespace llvm::Intrinsic;
 302 using namespace llvm::AMDGPUIntrinsic;
 303
 304 static SDValue
 305 InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
 306     unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
 307     SDValue Scalar, SDValue Chain) {
 308   if (!ExportMap[Slot]) {
 309     SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
 310       DL, MVT::v4f32,
 311       DAG.getUNDEF(MVT::v4f32),
 312       Scalar,
 313       DAG.getConstant(Channel, MVT::i32));
 314
 315     unsigned Mask = 1 << Channel;
 316
 317     const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
 318         DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
 319         DAG.getConstant(Mask, MVT::i32)};
 320
 321     SDValue Res =  DAG.getNode(
 322         AMDGPUISD::EXPORT,
 323         DL,
 324         MVT::Other,
 325         Ops, 6);
 326      ExportMap[Slot] = Res.getNode();
 327      return Res;
 328   }
 329
 330   SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ;
 331   SDValue PreviousVector = ExportInstruction->getOperand(1);
 332   SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
 333       DL, MVT::v4f32,
 334       PreviousVector,
 335       Scalar,
 336       DAG.getConstant(Channel, MVT::i32));
 337
 338   unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5))
 339       ->getZExtValue();
 340   Mask |= (1 << Channel);
 341
 342   const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
 343       DAG.getConstant(Inst, MVT::i32),
 344       DAG.getConstant(Type, MVT::i32),
 345       DAG.getConstant(Slot, MVT::i32),
 346       DAG.getConstant(Mask, MVT::i32)};
 347
 348   DAG.UpdateNodeOperands(ExportInstruction,
 349       Ops, 6);
 350
 351   return Chain;
 352
 353 }
 354
 355 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 356   switch (Op.getOpcode()) {
 357   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 358   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
 359   case ISD::ROTL: return LowerROTL(Op, DAG);
 360   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 361   case ISD::SELECT: return LowerSELECT(Op, DAG);
 362   case ISD::SETCC: return LowerSETCC(Op, DAG);
 363   case ISD::STORE: return LowerSTORE(Op, DAG);
 364   case ISD::LOAD: return LowerLOAD(Op, DAG);
 365   case ISD::FPOW: return LowerFPOW(Op, DAG);
 366   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
 367   case ISD::INTRINSIC_VOID: {
 368     SDValue Chain = Op.getOperand(0);
 369     unsigned IntrinsicID =
 370                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 371     switch (IntrinsicID) {
 372     case AMDGPUIntrinsic::AMDGPU_store_output: {
 373       MachineFunction &MF = DAG.getMachineFunction();
 374       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 375       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 376       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 377       MFI->LiveOuts.push_back(Reg);
 378       return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
 379     }
 380     case AMDGPUIntrinsic::R600_store_pixel_color: {
 381       MachineFunction &MF = DAG.getMachineFunction();
 382       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 383       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 384
 385       SDNode **OutputsMap = MFI->Outputs;
 386       return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
 387           RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
 388           Chain);
 389
 390     }
 391
 392     // default for switch(IntrinsicID)
 393     default: break;
 394     }
 395     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 396     break;
 397   }
 398   case ISD::INTRINSIC_WO_CHAIN: {
 399     unsigned IntrinsicID =
 400                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 401     EVT VT = Op.getValueType();
 402     DebugLoc DL = Op.getDebugLoc();
 403     switch(IntrinsicID) {
 404     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 405     case AMDGPUIntrinsic::R600_load_input: {
 406       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 407       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 408       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
 409     }
 410
 411     case AMDGPUIntrinsic::R600_interp_input: {
 412       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 413       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 414       MachineSDNode *interp;
 415       if (ijb < 0) {
 416         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 417             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 418         return DAG.getTargetExtractSubreg(
 419             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 420             DL, MVT::f32, SDValue(interp, 0));
 421       }
 422
 423       if (slot % 4 < 2)
 424         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 425             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 426             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 427                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
 428             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 429                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
 430       else
 431         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 432             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 433             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 434                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
 435             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 436                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
 437
 438       return SDValue(interp, slot % 2);
 439     }
 440
 441     case r600_read_ngroups_x:
 442       return LowerImplicitParameter(DAG, VT, DL, 0);
 443     case r600_read_ngroups_y:
 444       return LowerImplicitParameter(DAG, VT, DL, 1);
 445     case r600_read_ngroups_z:
 446       return LowerImplicitParameter(DAG, VT, DL, 2);
 447     case r600_read_global_size_x:
 448       return LowerImplicitParameter(DAG, VT, DL, 3);
 449     case r600_read_global_size_y:
 450       return LowerImplicitParameter(DAG, VT, DL, 4);
 451     case r600_read_global_size_z:
 452       return LowerImplicitParameter(DAG, VT, DL, 5);
 453     case r600_read_local_size_x:
 454       return LowerImplicitParameter(DAG, VT, DL, 6);
 455     case r600_read_local_size_y:
 456       return LowerImplicitParameter(DAG, VT, DL, 7);
 457     case r600_read_local_size_z:
 458       return LowerImplicitParameter(DAG, VT, DL, 8);
 459
 460     case r600_read_tgid_x:
 461       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 462                                   AMDGPU::T1_X, VT);
 463     case r600_read_tgid_y:
 464       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 465                                   AMDGPU::T1_Y, VT);
 466     case r600_read_tgid_z:
 467       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 468                                   AMDGPU::T1_Z, VT);
 469     case r600_read_tidig_x:
 470       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 471                                   AMDGPU::T0_X, VT);
 472     case r600_read_tidig_y:
 473       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 474                                   AMDGPU::T0_Y, VT);
 475     case r600_read_tidig_z:
 476       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 477                                   AMDGPU::T0_Z, VT);
 478     }
 479     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 480     break;
 481   }
 482   } // end switch(Op.getOpcode())
 483   return SDValue();
 484 }
 485
 486 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 487                                             SmallVectorImpl<SDValue> &Results,
 488                                             SelectionDAG &DAG) const {
 489   switch (N->getOpcode()) {
 490   default: return;
 491   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 492     return;
 493   case ISD::LOAD: {
 494     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 495     Results.push_back(SDValue(Node, 0));
 496     Results.push_back(SDValue(Node, 1));
 497     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 498     // function
 499     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 500     return;
 501   }
 502   case ISD::STORE:
 503     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 504     Results.push_back(SDValue(Node, 0));
 505     return;
 506   }
 507 }
 508
 509 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 510   return DAG.getNode(
 511       ISD::SETCC,
 512       Op.getDebugLoc(),
 513       MVT::i1,
 514       Op, DAG.getConstantFP(0.0f, MVT::f32),
 515       DAG.getCondCode(ISD::SETNE)
 516       );
 517 }
 518
 519 SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
 520   SDValue Chain = Op.getOperand(0);
 521   SDValue CC = Op.getOperand(1);
 522   SDValue LHS   = Op.getOperand(2);
 523   SDValue RHS   = Op.getOperand(3);
 524   SDValue JumpT  = Op.getOperand(4);
 525   SDValue CmpValue;
 526   SDValue Result;
 527
 528   if (LHS.getValueType() == MVT::i32) {
 529     CmpValue = DAG.getNode(
 530         ISD::SELECT_CC,
 531         Op.getDebugLoc(),
 532         MVT::i32,
 533         LHS, RHS,
 534         DAG.getConstant(-1, MVT::i32),
 535         DAG.getConstant(0, MVT::i32),
 536         CC);
 537   } else if (LHS.getValueType() == MVT::f32) {
 538     CmpValue = DAG.getNode(
 539         ISD::SELECT_CC,
 540         Op.getDebugLoc(),
 541         MVT::f32,
 542         LHS, RHS,
 543         DAG.getConstantFP(1.0f, MVT::f32),
 544         DAG.getConstantFP(0.0f, MVT::f32),
 545         CC);
 546   } else {
 547     assert(0 && "Not valid type for br_cc");
 548   }
 549   Result = DAG.getNode(
 550       AMDGPUISD::BRANCH_COND,
 551       CmpValue.getDebugLoc(),
 552       MVT::Other, Chain,
 553       JumpT, CmpValue);
 554   return Result;
 555 }
 556
 557 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 558                                                    DebugLoc DL,
 559                                                    unsigned DwordOffset) const {
 560   unsigned ByteOffset = DwordOffset * 4;
 561   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 562                                       AMDGPUAS::PARAM_I_ADDRESS);
 563
 564   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 565   assert(isInt<16>(ByteOffset));
 566
 567   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 568                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 569                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 570                      false, false, false, 0);
 571 }
 572
 573 SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
 574
 575   MachineFunction &MF = DAG.getMachineFunction();
 576   const AMDGPUFrameLowering *TFL =
 577    static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
 578
 579   FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
 580   assert(FIN);
 581
 582   unsigned FrameIndex = FIN->getIndex();
 583   unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
 584   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
 585 }
 586
 587 SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
 588   DebugLoc DL = Op.getDebugLoc();
 589   EVT VT = Op.getValueType();
 590
 591   return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
 592                      Op.getOperand(0),
 593                      Op.getOperand(0),
 594                      DAG.getNode(ISD::SUB, DL, VT,
 595                                  DAG.getConstant(32, MVT::i32),
 596                                  Op.getOperand(1)));
 597 }
 598
 599 bool R600TargetLowering::isZero(SDValue Op) const {
 600   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 601     return Cst->isNullValue();
 602   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 603     return CstFP->isZero();
 604   } else {
 605     return false;
 606   }
 607 }
 608
 609 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 610   DebugLoc DL = Op.getDebugLoc();
 611   EVT VT = Op.getValueType();
 612
 613   SDValue LHS = Op.getOperand(0);
 614   SDValue RHS = Op.getOperand(1);
 615   SDValue True = Op.getOperand(2);
 616   SDValue False = Op.getOperand(3);
 617   SDValue CC = Op.getOperand(4);
 618   SDValue Temp;
 619
 620   // LHS and RHS are guaranteed to be the same value type
 621   EVT CompareVT = LHS.getValueType();
 622
 623   // Check if we can lower this to a native operation.
 624
 625   // Try to lower to a CND* instruction:
 626   // CND* instructions requires RHS to be zero.  Some SELECT_CC nodes that
 627   // can be lowered to CND* instructions can also be lowered to SET*
 628   // instructions.  CND* instructions are cheaper, because they dont't
 629   // require additional instructions to convert their result to the correct
 630   // value type, so this check should be first.
 631   if (isZero(LHS) || isZero(RHS)) {
 632     SDValue Cond = (isZero(LHS) ? RHS : LHS);
 633     SDValue Zero = (isZero(LHS) ? LHS : RHS);
 634     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 635     if (CompareVT != VT) {
 636       // Bitcast True / False to the correct types.  This will end up being
 637       // a nop, but it allows us to define only a single pattern in the
 638       // .TD files for each CND* instruction rather than having to have
 639       // one pattern for integer True/False and one for fp True/False
 640       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 641       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 642     }
 643     if (isZero(LHS)) {
 644       CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
 645     }
 646
 647     switch (CCOpcode) {
 648     case ISD::SETONE:
 649     case ISD::SETUNE:
 650     case ISD::SETNE:
 651     case ISD::SETULE:
 652     case ISD::SETULT:
 653     case ISD::SETOLE:
 654     case ISD::SETOLT:
 655     case ISD::SETLE:
 656     case ISD::SETLT:
 657       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 658       Temp = True;
 659       True = False;
 660       False = Temp;
 661       break;
 662     default:
 663       break;
 664     }
 665     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 666         Cond, Zero,
 667         True, False,
 668         DAG.getCondCode(CCOpcode));
 669     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 670   }
 671
 672   // Try to lower to a SET* instruction:
 673   // We need all the operands of SELECT_CC to have the same value type, so if
 674   // necessary we need to change True and False to be the same type as LHS and
 675   // RHS, and then convert the result of the select_cc back to the correct type.
 676
 677   // Move hardware True/False values to the correct operand.
 678   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 679     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 680     std::swap(False, True);
 681     CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
 682   }
 683
 684   if (isHWTrueValue(True) && isHWFalseValue(False)) {
 685     if (CompareVT !=  VT) {
 686       if (VT == MVT::f32 && CompareVT == MVT::i32) {
 687         SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 688             LHS, RHS,
 689             DAG.getConstant(-1, MVT::i32),
 690             DAG.getConstant(0, MVT::i32),
 691             CC);
 692         // Convert integer values of true (-1) and false (0) to fp values of
 693         // true (1.0f) and false (0.0f).
 694         SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
 695                                                   DAG.getConstant(1, MVT::i32));
 696         return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
 697       } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
 698         SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 699             LHS, RHS,
 700             DAG.getConstantFP(1.0f, MVT::f32),
 701             DAG.getConstantFP(0.0f, MVT::f32),
 702             CC);
 703         // Convert fp values of true (1.0f) and false (0.0f) to integer values
 704         // of true (-1) and false (0).
 705         SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
 706         return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
 707       } else {
 708         // I don't think there will be any other type pairings.
 709         assert(!"Unhandled operand type parings in SELECT_CC");
 710       }
 711     } else {
 712       // This SELECT_CC is already legal.
 713       return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 714     }
 715   }
 716
 717   // Possible Min/Max pattern
 718   SDValue MinMax = LowerMinMax(Op, DAG);
 719   if (MinMax.getNode()) {
 720     return MinMax;
 721   }
 722
 723   // If we make it this for it means we have no native instructions to handle
 724   // this SELECT_CC, so we must lower it.
 725   SDValue HWTrue, HWFalse;
 726
 727   if (CompareVT == MVT::f32) {
 728     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 729     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 730   } else if (CompareVT == MVT::i32) {
 731     HWTrue = DAG.getConstant(-1, CompareVT);
 732     HWFalse = DAG.getConstant(0, CompareVT);
 733   }
 734   else {
 735     assert(!"Unhandled value type in LowerSELECT_CC");
 736   }
 737
 738   // Lower this unsupported SELECT_CC into a combination of two supported
 739   // SELECT_CC operations.
 740   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 741
 742   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 743       Cond, HWFalse,
 744       True, False,
 745       DAG.getCondCode(ISD::SETNE));
 746 }
 747
 748 SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 749   return DAG.getNode(ISD::SELECT_CC,
 750       Op.getDebugLoc(),
 751       Op.getValueType(),
 752       Op.getOperand(0),
 753       DAG.getConstant(0, MVT::i32),
 754       Op.getOperand(1),
 755       Op.getOperand(2),
 756       DAG.getCondCode(ISD::SETNE));
 757 }
 758
 759 SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
 760   SDValue Cond;
 761   SDValue LHS = Op.getOperand(0);
 762   SDValue RHS = Op.getOperand(1);
 763   SDValue CC  = Op.getOperand(2);
 764   DebugLoc DL = Op.getDebugLoc();
 765   assert(Op.getValueType() == MVT::i32);
 766   if (LHS.getValueType() == MVT::i32) {
 767     Cond = DAG.getNode(
 768         ISD::SELECT_CC,
 769         Op.getDebugLoc(),
 770         MVT::i32,
 771         LHS, RHS,
 772         DAG.getConstant(-1, MVT::i32),
 773         DAG.getConstant(0, MVT::i32),
 774         CC);
 775   } else if (LHS.getValueType() == MVT::f32) {
 776     Cond = DAG.getNode(
 777         ISD::SELECT_CC,
 778         Op.getDebugLoc(),
 779         MVT::f32,
 780         LHS, RHS,
 781         DAG.getConstantFP(1.0f, MVT::f32),
 782         DAG.getConstantFP(0.0f, MVT::f32),
 783         CC);
 784     Cond = DAG.getNode(
 785         ISD::FP_TO_SINT,
 786         DL,
 787         MVT::i32,
 788         Cond);
 789   } else {
 790     assert(0 && "Not valid type for set_cc");
 791   }
 792   Cond = DAG.getNode(
 793       ISD::AND,
 794       DL,
 795       MVT::i32,
 796       DAG.getConstant(1, MVT::i32),
 797       Cond);
 798   return Cond;
 799 }
 800
 801 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 802 /// convert these pointers to a register index.  Each register holds
 803 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
 804 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
 805 /// for indirect addressing.
 806 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
 807                                                unsigned StackWidth,
 808                                                SelectionDAG &DAG) const {
 809   unsigned SRLPad;
 810   switch(StackWidth) {
 811   case 1:
 812     SRLPad = 2;
 813     break;
 814   case 2:
 815     SRLPad = 3;
 816     break;
 817   case 4:
 818     SRLPad = 4;
 819     break;
 820   default: llvm_unreachable("Invalid stack width");
 821   }
 822
 823   return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
 824                      DAG.getConstant(SRLPad, MVT::i32));
 825 }
 826
 827 void R600TargetLowering::getStackAddress(unsigned StackWidth,
 828                                          unsigned ElemIdx,
 829                                          unsigned &Channel,
 830                                          unsigned &PtrIncr) const {
 831   switch (StackWidth) {
 832   default:
 833   case 1:
 834     Channel = 0;
 835     if (ElemIdx > 0) {
 836       PtrIncr = 1;
 837     } else {
 838       PtrIncr = 0;
 839     }
 840     break;
 841   case 2:
 842     Channel = ElemIdx % 2;
 843     if (ElemIdx == 2) {
 844       PtrIncr = 1;
 845     } else {
 846       PtrIncr = 0;
 847     }
 848     break;
 849   case 4:
 850     Channel = ElemIdx;
 851     PtrIncr = 0;
 852     break;
 853   }
 854 }
 855
 856 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 857   DebugLoc DL = Op.getDebugLoc();
 858   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
 859   SDValue Chain = Op.getOperand(0);
 860   SDValue Value = Op.getOperand(1);
 861   SDValue Ptr = Op.getOperand(2);
 862
 863   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
 864       Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
 865     // Convert pointer from byte address to dword address.
 866     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
 867                       DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
 868                                   Ptr, DAG.getConstant(2, MVT::i32)));
 869
 870     if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
 871       assert(!"Truncated and indexed stores not supported yet");
 872     } else {
 873       Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
 874     }
 875     return Chain;
 876   }
 877
 878   EVT ValueVT = Value.getValueType();
 879
 880   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
 881     return SDValue();
 882   }
 883
 884   // Lowering for indirect addressing
 885
 886   const MachineFunction &MF = DAG.getMachineFunction();
 887   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
 888                                          getTargetMachine().getFrameLowering());
 889   unsigned StackWidth = TFL->getStackWidth(MF);
 890
 891   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
 892
 893   if (ValueVT.isVector()) {
 894     unsigned NumElemVT = ValueVT.getVectorNumElements();
 895     EVT ElemVT = ValueVT.getVectorElementType();
 896     SDValue Stores[4];
 897
 898     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
 899                                       "vector width in load");
 900
 901     for (unsigned i = 0; i < NumElemVT; ++i) {
 902       unsigned Channel, PtrIncr;
 903       getStackAddress(StackWidth, i, Channel, PtrIncr);
 904       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
 905                         DAG.getConstant(PtrIncr, MVT::i32));
 906       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
 907                                  Value, DAG.getConstant(i, MVT::i32));
 908
 909       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
 910                               Chain, Elem, Ptr,
 911                               DAG.getTargetConstant(Channel, MVT::i32));
 912     }
 913      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
 914    } else {
 915     if (ValueVT == MVT::i8) {
 916       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
 917     }
 918     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
 919     DAG.getTargetConstant(0, MVT::i32)); // Channel
 920   }
 921
 922   return Chain;
 923 }
 924
 925 // return (512 + (kc_bank << 12)
 926 static int
 927 ConstantAddressBlock(unsigned AddressSpace) {
 928   switch (AddressSpace) {
 929   case AMDGPUAS::CONSTANT_BUFFER_0:
 930     return 512;
 931   case AMDGPUAS::CONSTANT_BUFFER_1:
 932     return 512 + 4096;
 933   case AMDGPUAS::CONSTANT_BUFFER_2:
 934     return 512 + 4096 * 2;
 935   case AMDGPUAS::CONSTANT_BUFFER_3:
 936     return 512 + 4096 * 3;
 937   case AMDGPUAS::CONSTANT_BUFFER_4:
 938     return 512 + 4096 * 4;
 939   case AMDGPUAS::CONSTANT_BUFFER_5:
 940     return 512 + 4096 * 5;
 941   case AMDGPUAS::CONSTANT_BUFFER_6:
 942     return 512 + 4096 * 6;
 943   case AMDGPUAS::CONSTANT_BUFFER_7:
 944     return 512 + 4096 * 7;
 945   case AMDGPUAS::CONSTANT_BUFFER_8:
 946     return 512 + 4096 * 8;
 947   case AMDGPUAS::CONSTANT_BUFFER_9:
 948     return 512 + 4096 * 9;
 949   case AMDGPUAS::CONSTANT_BUFFER_10:
 950     return 512 + 4096 * 10;
 951   case AMDGPUAS::CONSTANT_BUFFER_11:
 952     return 512 + 4096 * 11;
 953   case AMDGPUAS::CONSTANT_BUFFER_12:
 954     return 512 + 4096 * 12;
 955   case AMDGPUAS::CONSTANT_BUFFER_13:
 956     return 512 + 4096 * 13;
 957   case AMDGPUAS::CONSTANT_BUFFER_14:
 958     return 512 + 4096 * 14;
 959   case AMDGPUAS::CONSTANT_BUFFER_15:
 960     return 512 + 4096 * 15;
 961   default:
 962     return -1;
 963   }
 964 }
 965
 966 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
 967 {
 968   EVT VT = Op.getValueType();
 969   DebugLoc DL = Op.getDebugLoc();
 970   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
 971   SDValue Chain = Op.getOperand(0);
 972   SDValue Ptr = Op.getOperand(1);
 973   SDValue LoweredLoad;
 974
 975   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
 976   if (ConstantBlock > -1) {
 977     SDValue Result;
 978     if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
 979         dyn_cast<Constant>(LoadNode->getSrcValue())) {
 980       SDValue Slots[4];
 981       for (unsigned i = 0; i < 4; i++) {
 982         // We want Const position encoded with the following formula :
 983         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
 984         // const_index is Ptr computed by llvm using an alignment of 16.
 985         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
 986         // then div by 4 at the ISel step
 987         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
 988             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
 989         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
 990       }
 991       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
 992     } else {
 993       // non constant ptr cant be folded, keeps it as a v4f32 load
 994       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
 995           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
 996           );
 997     }
 998
 999     if (!VT.isVector()) {
1000       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1001           DAG.getConstant(0, MVT::i32));
1002     }
1003
1004     SDValue MergedValues[2] = {
1005         Result,
1006         Chain
1007     };
1008     return DAG.getMergeValues(MergedValues, 2, DL);
1009   }
1010
1011   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1012     return SDValue();
1013   }
1014
1015   // Lowering for indirect addressing
1016   const MachineFunction &MF = DAG.getMachineFunction();
1017   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1018                                          getTargetMachine().getFrameLowering());
1019   unsigned StackWidth = TFL->getStackWidth(MF);
1020
1021   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1022
1023   if (VT.isVector()) {
1024     unsigned NumElemVT = VT.getVectorNumElements();
1025     EVT ElemVT = VT.getVectorElementType();
1026     SDValue Loads[4];
1027
1028     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1029                                       "vector width in load");
1030
1031     for (unsigned i = 0; i < NumElemVT; ++i) {
1032       unsigned Channel, PtrIncr;
1033       getStackAddress(StackWidth, i, Channel, PtrIncr);
1034       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1035                         DAG.getConstant(PtrIncr, MVT::i32));
1036       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1037                              Chain, Ptr,
1038                              DAG.getTargetConstant(Channel, MVT::i32),
1039                              Op.getOperand(2));
1040     }
1041     for (unsigned i = NumElemVT; i < 4; ++i) {
1042       Loads[i] = DAG.getUNDEF(ElemVT);
1043     }
1044     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1045     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1046   } else {
1047     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1048                               Chain, Ptr,
1049                               DAG.getTargetConstant(0, MVT::i32), // Channel
1050                               Op.getOperand(2));
1051   }
1052
1053   SDValue Ops[2];
1054   Ops[0] = LoweredLoad;
1055   Ops[1] = Chain;
1056
1057   return DAG.getMergeValues(Ops, 2, DL);
1058 }
1059
1060 SDValue R600TargetLowering::LowerFPOW(SDValue Op,
1061     SelectionDAG &DAG) const {
1062   DebugLoc DL = Op.getDebugLoc();
1063   EVT VT = Op.getValueType();
1064   SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
1065   SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
1066   return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
1067 }
1068
1069 /// XXX Only kernel functions are supported, so we can assume for now that
1070 /// every function is a kernel function, but in the future we should use
1071 /// separate calling conventions for kernel and non-kernel functions.
1072 SDValue R600TargetLowering::LowerFormalArguments(
1073                                       SDValue Chain,
1074                                       CallingConv::ID CallConv,
1075                                       bool isVarArg,
1076                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1077                                       DebugLoc DL, SelectionDAG &DAG,
1078                                       SmallVectorImpl<SDValue> &InVals) const {
1079   unsigned ParamOffsetBytes = 36;
1080   Function::const_arg_iterator FuncArg =
1081                             DAG.getMachineFunction().getFunction()->arg_begin();
1082   for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
1083     EVT VT = Ins[i].VT;
1084     Type *ArgType = FuncArg->getType();
1085     unsigned ArgSizeInBits = ArgType->isPointerTy() ?
1086                              32 : ArgType->getPrimitiveSizeInBits();
1087     unsigned ArgBytes = ArgSizeInBits >> 3;
1088     EVT ArgVT;
1089     if (ArgSizeInBits < VT.getSizeInBits()) {
1090       assert(!ArgType->isFloatTy() &&
1091              "Extending floating point arguments not supported yet");
1092       ArgVT = MVT::getIntegerVT(ArgSizeInBits);
1093     } else {
1094       ArgVT = VT;
1095     }
1096     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1097                                                     AMDGPUAS::PARAM_I_ADDRESS);
1098     SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
1099                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
1100                                        MachinePointerInfo(new Argument(PtrTy)),
1101                                        ArgVT, false, false, ArgBytes);
1102     InVals.push_back(Arg);
1103     ParamOffsetBytes += ArgBytes;
1104   }
1105   return Chain;
1106 }
1107
1108 EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
1109    if (!VT.isVector()) return MVT::i32;
1110    return VT.changeVectorElementTypeToInteger();
1111 }
1112
1113 //===----------------------------------------------------------------------===//
1114 // Custom DAG Optimizations
1115 //===----------------------------------------------------------------------===//
1116
1117 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1118                                               DAGCombinerInfo &DCI) const {
1119   SelectionDAG &DAG = DCI.DAG;
1120
1121   switch (N->getOpcode()) {
1122   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1123   case ISD::FP_ROUND: {
1124       SDValue Arg = N->getOperand(0);
1125       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1126         return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
1127                            Arg.getOperand(0));
1128       }
1129       break;
1130     }
1131   // Extract_vec (Build_vector) generated by custom lowering
1132   // also needs to be customly combined
1133   case ISD::EXTRACT_VECTOR_ELT: {
1134     SDValue Arg = N->getOperand(0);
1135     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1136       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1137         unsigned Element = Const->getZExtValue();
1138         return Arg->getOperand(Element);
1139       }
1140     }
1141     if (Arg.getOpcode() == ISD::BITCAST &&
1142         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1143       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1144         unsigned Element = Const->getZExtValue();
1145         return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
1146             Arg->getOperand(0).getOperand(Element));
1147       }
1148     }
1149   }
1150   }
1151   return SDValue();
1152 }