lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/MachineInstrBuilder.h"
  20 #include "llvm/CodeGen/MachineRegisterInfo.h"
  21 #include "llvm/CodeGen/SelectionDAG.h"
  22 #include "llvm/IR/Argument.h"
  23 #include "llvm/IR/Function.h"
  24
  25 using namespace llvm;
  26
  27 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  28     AMDGPUTargetLowering(TM),
  29     TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  30   setOperationAction(ISD::MUL, MVT::i64, Expand);
  31   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  32   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  33   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  34   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  35   computeRegisterProperties();
  36
  37   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  38   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  39   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  40   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  41
  42   setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
  43   setOperationAction(ISD::AND,  MVT::v4i32, Expand);
  44   setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  45   setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  46   setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  47   setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  48   setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  49   setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  50   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  51
  52   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  53   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  54
  55   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  56
  57   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  58   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  59   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  60   setOperationAction(ISD::FPOW, MVT::f32, Custom);
  61
  62   setOperationAction(ISD::ROTL, MVT::i32, Custom);
  63
  64   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  65   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::i32, Custom);
  68   setOperationAction(ISD::SETCC, MVT::f32, Custom);
  69   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  70
  71   setOperationAction(ISD::SELECT, MVT::i32, Custom);
  72   setOperationAction(ISD::SELECT, MVT::f32, Custom);
  73
  74   setOperationAction(ISD::STORE, MVT::i32, Custom);
  75   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  76
  77   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  78   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  79   setTargetDAGCombine(ISD::FP_ROUND);
  80   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  81
  82   setSchedulingPreference(Sched::VLIW);
  83 }
  84
  85 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
  86     MachineInstr * MI, MachineBasicBlock * BB) const {
  87   MachineFunction * MF = BB->getParent();
  88   MachineRegisterInfo &MRI = MF->getRegInfo();
  89   MachineBasicBlock::iterator I = *MI;
  90
  91   switch (MI->getOpcode()) {
  92   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  93   case AMDGPU::SHADER_TYPE: break;
  94   case AMDGPU::CLAMP_R600: {
  95     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
  96                                                    AMDGPU::MOV,
  97                                                    MI->getOperand(0).getReg(),
  98                                                    MI->getOperand(1).getReg());
  99     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 100     break;
 101   }
 102
 103   case AMDGPU::FABS_R600: {
 104     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 105                                                     AMDGPU::MOV,
 106                                                     MI->getOperand(0).getReg(),
 107                                                     MI->getOperand(1).getReg());
 108     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 109     break;
 110   }
 111
 112   case AMDGPU::FNEG_R600: {
 113     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 114                                                     AMDGPU::MOV,
 115                                                     MI->getOperand(0).getReg(),
 116                                                     MI->getOperand(1).getReg());
 117     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 118     break;
 119   }
 120
 121   case AMDGPU::MASK_WRITE: {
 122     unsigned maskedRegister = MI->getOperand(0).getReg();
 123     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 124     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 125     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 126     break;
 127   }
 128
 129   case AMDGPU::MOV_IMM_F32:
 130     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 131                      MI->getOperand(1).getFPImm()->getValueAPF()
 132                          .bitcastToAPInt().getZExtValue());
 133     break;
 134   case AMDGPU::MOV_IMM_I32:
 135     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 136                      MI->getOperand(1).getImm());
 137     break;
 138
 139
 140   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 141   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 142     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 143
 144     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 145             .addOperand(MI->getOperand(0))
 146             .addOperand(MI->getOperand(1))
 147             .addImm(EOP); // Set End of program bit
 148     break;
 149   }
 150
 151   case AMDGPU::TXD: {
 152     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 153     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 154
 155     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 156             .addOperand(MI->getOperand(3))
 157             .addOperand(MI->getOperand(4))
 158             .addOperand(MI->getOperand(5))
 159             .addOperand(MI->getOperand(6));
 160     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 161             .addOperand(MI->getOperand(2))
 162             .addOperand(MI->getOperand(4))
 163             .addOperand(MI->getOperand(5))
 164             .addOperand(MI->getOperand(6));
 165     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 166             .addOperand(MI->getOperand(0))
 167             .addOperand(MI->getOperand(1))
 168             .addOperand(MI->getOperand(4))
 169             .addOperand(MI->getOperand(5))
 170             .addOperand(MI->getOperand(6))
 171             .addReg(T0, RegState::Implicit)
 172             .addReg(T1, RegState::Implicit);
 173     break;
 174   }
 175
 176   case AMDGPU::TXD_SHADOW: {
 177     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 178     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 179
 180     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 181             .addOperand(MI->getOperand(3))
 182             .addOperand(MI->getOperand(4))
 183             .addOperand(MI->getOperand(5))
 184             .addOperand(MI->getOperand(6));
 185     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 186             .addOperand(MI->getOperand(2))
 187             .addOperand(MI->getOperand(4))
 188             .addOperand(MI->getOperand(5))
 189             .addOperand(MI->getOperand(6));
 190     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 191             .addOperand(MI->getOperand(0))
 192             .addOperand(MI->getOperand(1))
 193             .addOperand(MI->getOperand(4))
 194             .addOperand(MI->getOperand(5))
 195             .addOperand(MI->getOperand(6))
 196             .addReg(T0, RegState::Implicit)
 197             .addReg(T1, RegState::Implicit);
 198     break;
 199   }
 200
 201   case AMDGPU::BRANCH:
 202       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 203               .addOperand(MI->getOperand(0))
 204               .addReg(0);
 205       break;
 206
 207   case AMDGPU::BRANCH_COND_f32: {
 208     MachineInstr *NewMI =
 209       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 210               AMDGPU::PREDICATE_BIT)
 211               .addOperand(MI->getOperand(1))
 212               .addImm(OPCODE_IS_NOT_ZERO)
 213               .addImm(0); // Flags
 214     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 215     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 216             .addOperand(MI->getOperand(0))
 217             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 218     break;
 219   }
 220
 221   case AMDGPU::BRANCH_COND_i32: {
 222     MachineInstr *NewMI =
 223       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 224             AMDGPU::PREDICATE_BIT)
 225             .addOperand(MI->getOperand(1))
 226             .addImm(OPCODE_IS_NOT_ZERO_INT)
 227             .addImm(0); // Flags
 228     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 229     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 230            .addOperand(MI->getOperand(0))
 231             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 232     break;
 233   }
 234
 235   case AMDGPU::EG_ExportSwz:
 236   case AMDGPU::R600_ExportSwz: {
 237     // Instruction is left unmodified if its not the last one of its type
 238     bool isLastInstructionOfItsType = true;
 239     unsigned InstExportType = MI->getOperand(1).getImm();
 240     for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
 241          EndBlock = BB->end(); NextExportInst != EndBlock;
 242          NextExportInst = llvm::next(NextExportInst)) {
 243       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 244           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 245         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 246             .getImm();
 247         if (CurrentInstExportType == InstExportType) {
 248           isLastInstructionOfItsType = false;
 249           break;
 250         }
 251       }
 252     }
 253     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 254     if (!EOP && !isLastInstructionOfItsType)
 255       return BB;
 256     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 257     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 258             .addOperand(MI->getOperand(0))
 259             .addOperand(MI->getOperand(1))
 260             .addOperand(MI->getOperand(2))
 261             .addOperand(MI->getOperand(3))
 262             .addOperand(MI->getOperand(4))
 263             .addOperand(MI->getOperand(5))
 264             .addOperand(MI->getOperand(6))
 265             .addImm(CfInst)
 266             .addImm(EOP);
 267     break;
 268   }
 269   case AMDGPU::RETURN: {
 270     // RETURN instructions must have the live-out registers as implicit uses,
 271     // otherwise they appear dead.
 272     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 273     MachineInstrBuilder MIB(*MF, MI);
 274     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 275       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 276     return BB;
 277   }
 278   }
 279
 280   MI->eraseFromParent();
 281   return BB;
 282 }
 283
 284 //===----------------------------------------------------------------------===//
 285 // Custom DAG Lowering Operations
 286 //===----------------------------------------------------------------------===//
 287
 288 using namespace llvm::Intrinsic;
 289 using namespace llvm::AMDGPUIntrinsic;
 290
 291 static SDValue
 292 InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
 293     unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
 294     SDValue Scalar, SDValue Chain) {
 295   if (!ExportMap[Slot]) {
 296     SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
 297       DL, MVT::v4f32,
 298       DAG.getUNDEF(MVT::v4f32),
 299       Scalar,
 300       DAG.getConstant(Channel, MVT::i32));
 301
 302     unsigned Mask = 1 << Channel;
 303
 304     const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
 305         DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
 306         DAG.getConstant(Mask, MVT::i32)};
 307
 308     SDValue Res =  DAG.getNode(
 309         AMDGPUISD::EXPORT,
 310         DL,
 311         MVT::Other,
 312         Ops, 6);
 313      ExportMap[Slot] = Res.getNode();
 314      return Res;
 315   }
 316
 317   SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ;
 318   SDValue PreviousVector = ExportInstruction->getOperand(1);
 319   SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
 320       DL, MVT::v4f32,
 321       PreviousVector,
 322       Scalar,
 323       DAG.getConstant(Channel, MVT::i32));
 324
 325   unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5))
 326       ->getZExtValue();
 327   Mask |= (1 << Channel);
 328
 329   const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
 330       DAG.getConstant(Inst, MVT::i32),
 331       DAG.getConstant(Type, MVT::i32),
 332       DAG.getConstant(Slot, MVT::i32),
 333       DAG.getConstant(Mask, MVT::i32)};
 334
 335   DAG.UpdateNodeOperands(ExportInstruction,
 336       Ops, 6);
 337
 338   return Chain;
 339
 340 }
 341
 342 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 343   switch (Op.getOpcode()) {
 344   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 345   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
 346   case ISD::ROTL: return LowerROTL(Op, DAG);
 347   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 348   case ISD::SELECT: return LowerSELECT(Op, DAG);
 349   case ISD::SETCC: return LowerSETCC(Op, DAG);
 350   case ISD::STORE: return LowerSTORE(Op, DAG);
 351   case ISD::LOAD: return LowerLOAD(Op, DAG);
 352   case ISD::FPOW: return LowerFPOW(Op, DAG);
 353   case ISD::INTRINSIC_VOID: {
 354     SDValue Chain = Op.getOperand(0);
 355     unsigned IntrinsicID =
 356                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 357     switch (IntrinsicID) {
 358     case AMDGPUIntrinsic::AMDGPU_store_output: {
 359       MachineFunction &MF = DAG.getMachineFunction();
 360       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 361       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 362       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 363       MFI->LiveOuts.push_back(Reg);
 364       return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
 365     }
 366     case AMDGPUIntrinsic::R600_store_pixel_color: {
 367       MachineFunction &MF = DAG.getMachineFunction();
 368       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 369       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 370
 371       SDNode **OutputsMap = MFI->Outputs;
 372       return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
 373           RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
 374           Chain);
 375
 376     }
 377
 378     // default for switch(IntrinsicID)
 379     default: break;
 380     }
 381     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 382     break;
 383   }
 384   case ISD::INTRINSIC_WO_CHAIN: {
 385     unsigned IntrinsicID =
 386                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 387     EVT VT = Op.getValueType();
 388     DebugLoc DL = Op.getDebugLoc();
 389     switch(IntrinsicID) {
 390     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 391     case AMDGPUIntrinsic::R600_load_input: {
 392       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 393       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 394       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
 395     }
 396
 397     case AMDGPUIntrinsic::R600_interp_input: {
 398       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 399       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 400       MachineSDNode *interp;
 401       if (ijb < 0) {
 402         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 403             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 404         return DAG.getTargetExtractSubreg(
 405             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 406             DL, MVT::f32, SDValue(interp, 0));
 407       }
 408
 409       if (slot % 4 < 2)
 410         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 411             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 412             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 413                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
 414             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 415                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
 416       else
 417         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 418             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 419             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 420                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
 421             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 422                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
 423
 424       return SDValue(interp, slot % 2);
 425     }
 426
 427     case r600_read_ngroups_x:
 428       return LowerImplicitParameter(DAG, VT, DL, 0);
 429     case r600_read_ngroups_y:
 430       return LowerImplicitParameter(DAG, VT, DL, 1);
 431     case r600_read_ngroups_z:
 432       return LowerImplicitParameter(DAG, VT, DL, 2);
 433     case r600_read_global_size_x:
 434       return LowerImplicitParameter(DAG, VT, DL, 3);
 435     case r600_read_global_size_y:
 436       return LowerImplicitParameter(DAG, VT, DL, 4);
 437     case r600_read_global_size_z:
 438       return LowerImplicitParameter(DAG, VT, DL, 5);
 439     case r600_read_local_size_x:
 440       return LowerImplicitParameter(DAG, VT, DL, 6);
 441     case r600_read_local_size_y:
 442       return LowerImplicitParameter(DAG, VT, DL, 7);
 443     case r600_read_local_size_z:
 444       return LowerImplicitParameter(DAG, VT, DL, 8);
 445
 446     case r600_read_tgid_x:
 447       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 448                                   AMDGPU::T1_X, VT);
 449     case r600_read_tgid_y:
 450       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 451                                   AMDGPU::T1_Y, VT);
 452     case r600_read_tgid_z:
 453       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 454                                   AMDGPU::T1_Z, VT);
 455     case r600_read_tidig_x:
 456       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 457                                   AMDGPU::T0_X, VT);
 458     case r600_read_tidig_y:
 459       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 460                                   AMDGPU::T0_Y, VT);
 461     case r600_read_tidig_z:
 462       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 463                                   AMDGPU::T0_Z, VT);
 464     }
 465     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 466     break;
 467   }
 468   } // end switch(Op.getOpcode())
 469   return SDValue();
 470 }
 471
 472 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 473                                             SmallVectorImpl<SDValue> &Results,
 474                                             SelectionDAG &DAG) const {
 475   switch (N->getOpcode()) {
 476   default: return;
 477   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 478     return;
 479   case ISD::LOAD: {
 480     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 481     Results.push_back(SDValue(Node, 0));
 482     Results.push_back(SDValue(Node, 1));
 483     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 484     // function
 485     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 486     return;
 487   }
 488   }
 489 }
 490
 491 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 492   return DAG.getNode(
 493       ISD::SETCC,
 494       Op.getDebugLoc(),
 495       MVT::i1,
 496       Op, DAG.getConstantFP(0.0f, MVT::f32),
 497       DAG.getCondCode(ISD::SETNE)
 498       );
 499 }
 500
 501 SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
 502   SDValue Chain = Op.getOperand(0);
 503   SDValue CC = Op.getOperand(1);
 504   SDValue LHS   = Op.getOperand(2);
 505   SDValue RHS   = Op.getOperand(3);
 506   SDValue JumpT  = Op.getOperand(4);
 507   SDValue CmpValue;
 508   SDValue Result;
 509
 510   if (LHS.getValueType() == MVT::i32) {
 511     CmpValue = DAG.getNode(
 512         ISD::SELECT_CC,
 513         Op.getDebugLoc(),
 514         MVT::i32,
 515         LHS, RHS,
 516         DAG.getConstant(-1, MVT::i32),
 517         DAG.getConstant(0, MVT::i32),
 518         CC);
 519   } else if (LHS.getValueType() == MVT::f32) {
 520     CmpValue = DAG.getNode(
 521         ISD::SELECT_CC,
 522         Op.getDebugLoc(),
 523         MVT::f32,
 524         LHS, RHS,
 525         DAG.getConstantFP(1.0f, MVT::f32),
 526         DAG.getConstantFP(0.0f, MVT::f32),
 527         CC);
 528   } else {
 529     assert(0 && "Not valid type for br_cc");
 530   }
 531   Result = DAG.getNode(
 532       AMDGPUISD::BRANCH_COND,
 533       CmpValue.getDebugLoc(),
 534       MVT::Other, Chain,
 535       JumpT, CmpValue);
 536   return Result;
 537 }
 538
 539 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 540                                                    DebugLoc DL,
 541                                                    unsigned DwordOffset) const {
 542   unsigned ByteOffset = DwordOffset * 4;
 543   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 544                                       AMDGPUAS::PARAM_I_ADDRESS);
 545
 546   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 547   assert(isInt<16>(ByteOffset));
 548
 549   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 550                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 551                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 552                      false, false, false, 0);
 553 }
 554
 555 SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
 556   DebugLoc DL = Op.getDebugLoc();
 557   EVT VT = Op.getValueType();
 558
 559   return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
 560                      Op.getOperand(0),
 561                      Op.getOperand(0),
 562                      DAG.getNode(ISD::SUB, DL, VT,
 563                                  DAG.getConstant(32, MVT::i32),
 564                                  Op.getOperand(1)));
 565 }
 566
 567 bool R600TargetLowering::isZero(SDValue Op) const {
 568   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 569     return Cst->isNullValue();
 570   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 571     return CstFP->isZero();
 572   } else {
 573     return false;
 574   }
 575 }
 576
 577 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 578   DebugLoc DL = Op.getDebugLoc();
 579   EVT VT = Op.getValueType();
 580
 581   SDValue LHS = Op.getOperand(0);
 582   SDValue RHS = Op.getOperand(1);
 583   SDValue True = Op.getOperand(2);
 584   SDValue False = Op.getOperand(3);
 585   SDValue CC = Op.getOperand(4);
 586   SDValue Temp;
 587
 588   // LHS and RHS are guaranteed to be the same value type
 589   EVT CompareVT = LHS.getValueType();
 590
 591   // Check if we can lower this to a native operation.
 592
 593   // Try to lower to a CND* instruction:
 594   // CND* instructions requires RHS to be zero.  Some SELECT_CC nodes that
 595   // can be lowered to CND* instructions can also be lowered to SET*
 596   // instructions.  CND* instructions are cheaper, because they dont't
 597   // require additional instructions to convert their result to the correct
 598   // value type, so this check should be first.
 599   if (isZero(LHS) || isZero(RHS)) {
 600     SDValue Cond = (isZero(LHS) ? RHS : LHS);
 601     SDValue Zero = (isZero(LHS) ? LHS : RHS);
 602     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 603     if (CompareVT != VT) {
 604       // Bitcast True / False to the correct types.  This will end up being
 605       // a nop, but it allows us to define only a single pattern in the
 606       // .TD files for each CND* instruction rather than having to have
 607       // one pattern for integer True/False and one for fp True/False
 608       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 609       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 610     }
 611     if (isZero(LHS)) {
 612       CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
 613     }
 614
 615     switch (CCOpcode) {
 616     case ISD::SETONE:
 617     case ISD::SETUNE:
 618     case ISD::SETNE:
 619     case ISD::SETULE:
 620     case ISD::SETULT:
 621     case ISD::SETOLE:
 622     case ISD::SETOLT:
 623     case ISD::SETLE:
 624     case ISD::SETLT:
 625       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 626       Temp = True;
 627       True = False;
 628       False = Temp;
 629       break;
 630     default:
 631       break;
 632     }
 633     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 634         Cond, Zero,
 635         True, False,
 636         DAG.getCondCode(CCOpcode));
 637     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 638   }
 639
 640   // Try to lower to a SET* instruction:
 641   // We need all the operands of SELECT_CC to have the same value type, so if
 642   // necessary we need to change True and False to be the same type as LHS and
 643   // RHS, and then convert the result of the select_cc back to the correct type.
 644
 645   // Move hardware True/False values to the correct operand.
 646   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 647     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 648     std::swap(False, True);
 649     CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
 650   }
 651
 652   if (isHWTrueValue(True) && isHWFalseValue(False)) {
 653     if (CompareVT !=  VT) {
 654       if (VT == MVT::f32 && CompareVT == MVT::i32) {
 655         SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 656             LHS, RHS,
 657             DAG.getConstant(-1, MVT::i32),
 658             DAG.getConstant(0, MVT::i32),
 659             CC);
 660         // Convert integer values of true (-1) and false (0) to fp values of
 661         // true (1.0f) and false (0.0f).
 662         SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
 663                                                   DAG.getConstant(1, MVT::i32));
 664         return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
 665       } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
 666         SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 667             LHS, RHS,
 668             DAG.getConstantFP(1.0f, MVT::f32),
 669             DAG.getConstantFP(0.0f, MVT::f32),
 670             CC);
 671         // Convert fp values of true (1.0f) and false (0.0f) to integer values
 672         // of true (-1) and false (0).
 673         SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
 674         return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
 675       } else {
 676         // I don't think there will be any other type pairings.
 677         assert(!"Unhandled operand type parings in SELECT_CC");
 678       }
 679     } else {
 680       // This SELECT_CC is already legal.
 681       return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 682     }
 683   }
 684
 685   // Possible Min/Max pattern
 686   SDValue MinMax = LowerMinMax(Op, DAG);
 687   if (MinMax.getNode()) {
 688     return MinMax;
 689   }
 690
 691   // If we make it this for it means we have no native instructions to handle
 692   // this SELECT_CC, so we must lower it.
 693   SDValue HWTrue, HWFalse;
 694
 695   if (CompareVT == MVT::f32) {
 696     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 697     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 698   } else if (CompareVT == MVT::i32) {
 699     HWTrue = DAG.getConstant(-1, CompareVT);
 700     HWFalse = DAG.getConstant(0, CompareVT);
 701   }
 702   else {
 703     assert(!"Unhandled value type in LowerSELECT_CC");
 704   }
 705
 706   // Lower this unsupported SELECT_CC into a combination of two supported
 707   // SELECT_CC operations.
 708   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 709
 710   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 711       Cond, HWFalse,
 712       True, False,
 713       DAG.getCondCode(ISD::SETNE));
 714 }
 715
 716 SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 717   return DAG.getNode(ISD::SELECT_CC,
 718       Op.getDebugLoc(),
 719       Op.getValueType(),
 720       Op.getOperand(0),
 721       DAG.getConstant(0, MVT::i32),
 722       Op.getOperand(1),
 723       Op.getOperand(2),
 724       DAG.getCondCode(ISD::SETNE));
 725 }
 726
 727 SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
 728   SDValue Cond;
 729   SDValue LHS = Op.getOperand(0);
 730   SDValue RHS = Op.getOperand(1);
 731   SDValue CC  = Op.getOperand(2);
 732   DebugLoc DL = Op.getDebugLoc();
 733   assert(Op.getValueType() == MVT::i32);
 734   if (LHS.getValueType() == MVT::i32) {
 735     Cond = DAG.getNode(
 736         ISD::SELECT_CC,
 737         Op.getDebugLoc(),
 738         MVT::i32,
 739         LHS, RHS,
 740         DAG.getConstant(-1, MVT::i32),
 741         DAG.getConstant(0, MVT::i32),
 742         CC);
 743   } else if (LHS.getValueType() == MVT::f32) {
 744     Cond = DAG.getNode(
 745         ISD::SELECT_CC,
 746         Op.getDebugLoc(),
 747         MVT::f32,
 748         LHS, RHS,
 749         DAG.getConstantFP(1.0f, MVT::f32),
 750         DAG.getConstantFP(0.0f, MVT::f32),
 751         CC);
 752     Cond = DAG.getNode(
 753         ISD::FP_TO_SINT,
 754         DL,
 755         MVT::i32,
 756         Cond);
 757   } else {
 758     assert(0 && "Not valid type for set_cc");
 759   }
 760   Cond = DAG.getNode(
 761       ISD::AND,
 762       DL,
 763       MVT::i32,
 764       DAG.getConstant(1, MVT::i32),
 765       Cond);
 766   return Cond;
 767 }
 768
 769 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 770   DebugLoc DL = Op.getDebugLoc();
 771   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
 772   SDValue Chain = Op.getOperand(0);
 773   SDValue Value = Op.getOperand(1);
 774   SDValue Ptr = Op.getOperand(2);
 775
 776   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
 777       Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
 778     // Convert pointer from byte address to dword address.
 779     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
 780                       DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
 781                                   Ptr, DAG.getConstant(2, MVT::i32)));
 782
 783     if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
 784       assert(!"Truncated and indexed stores not supported yet");
 785     } else {
 786       Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
 787     }
 788     return Chain;
 789   }
 790   return SDValue();
 791 }
 792
 793 // return (512 + (kc_bank << 12)
 794 static int
 795 ConstantAddressBlock(unsigned AddressSpace) {
 796   switch (AddressSpace) {
 797   case AMDGPUAS::CONSTANT_BUFFER_0:
 798     return 512;
 799   case AMDGPUAS::CONSTANT_BUFFER_1:
 800     return 512 + 4096;
 801   case AMDGPUAS::CONSTANT_BUFFER_2:
 802     return 512 + 4096 * 2;
 803   case AMDGPUAS::CONSTANT_BUFFER_3:
 804     return 512 + 4096 * 3;
 805   case AMDGPUAS::CONSTANT_BUFFER_4:
 806     return 512 + 4096 * 4;
 807   case AMDGPUAS::CONSTANT_BUFFER_5:
 808     return 512 + 4096 * 5;
 809   case AMDGPUAS::CONSTANT_BUFFER_6:
 810     return 512 + 4096 * 6;
 811   case AMDGPUAS::CONSTANT_BUFFER_7:
 812     return 512 + 4096 * 7;
 813   case AMDGPUAS::CONSTANT_BUFFER_8:
 814     return 512 + 4096 * 8;
 815   case AMDGPUAS::CONSTANT_BUFFER_9:
 816     return 512 + 4096 * 9;
 817   case AMDGPUAS::CONSTANT_BUFFER_10:
 818     return 512 + 4096 * 10;
 819   case AMDGPUAS::CONSTANT_BUFFER_11:
 820     return 512 + 4096 * 11;
 821   case AMDGPUAS::CONSTANT_BUFFER_12:
 822     return 512 + 4096 * 12;
 823   case AMDGPUAS::CONSTANT_BUFFER_13:
 824     return 512 + 4096 * 13;
 825   case AMDGPUAS::CONSTANT_BUFFER_14:
 826     return 512 + 4096 * 14;
 827   case AMDGPUAS::CONSTANT_BUFFER_15:
 828     return 512 + 4096 * 15;
 829   default:
 830     return -1;
 831   }
 832 }
 833
 834 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
 835 {
 836   EVT VT = Op.getValueType();
 837   DebugLoc DL = Op.getDebugLoc();
 838   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
 839   SDValue Chain = Op.getOperand(0);
 840   SDValue Ptr = Op.getOperand(1);
 841   SDValue LoweredLoad;
 842
 843   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
 844   if (ConstantBlock > -1) {
 845     SDValue Result;
 846     if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
 847         dyn_cast<Constant>(LoadNode->getSrcValue())) {
 848       SDValue Slots[4];
 849       for (unsigned i = 0; i < 4; i++) {
 850         // We want Const position encoded with the following formula :
 851         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
 852         // const_index is Ptr computed by llvm using an alignment of 16.
 853         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
 854         // then div by 4 at the ISel step
 855         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
 856             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
 857         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
 858       }
 859       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
 860     } else {
 861       // non constant ptr cant be folded, keeps it as a v4f32 load
 862       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
 863           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
 864           );
 865     }
 866
 867     if (!VT.isVector()) {
 868       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
 869           DAG.getConstant(0, MVT::i32));
 870     }
 871
 872     SDValue MergedValues[2] = {
 873         Result,
 874         Chain
 875     };
 876     return DAG.getMergeValues(MergedValues, 2, DL);
 877   }
 878
 879   return SDValue();
 880 }
 881
 882 SDValue R600TargetLowering::LowerFPOW(SDValue Op,
 883     SelectionDAG &DAG) const {
 884   DebugLoc DL = Op.getDebugLoc();
 885   EVT VT = Op.getValueType();
 886   SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
 887   SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
 888   return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
 889 }
 890
 891 /// XXX Only kernel functions are supported, so we can assume for now that
 892 /// every function is a kernel function, but in the future we should use
 893 /// separate calling conventions for kernel and non-kernel functions.
 894 SDValue R600TargetLowering::LowerFormalArguments(
 895                                       SDValue Chain,
 896                                       CallingConv::ID CallConv,
 897                                       bool isVarArg,
 898                                       const SmallVectorImpl<ISD::InputArg> &Ins,
 899                                       DebugLoc DL, SelectionDAG &DAG,
 900                                       SmallVectorImpl<SDValue> &InVals) const {
 901   unsigned ParamOffsetBytes = 36;
 902   Function::const_arg_iterator FuncArg =
 903                             DAG.getMachineFunction().getFunction()->arg_begin();
 904   for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
 905     EVT VT = Ins[i].VT;
 906     Type *ArgType = FuncArg->getType();
 907     unsigned ArgSizeInBits = ArgType->isPointerTy() ?
 908                              32 : ArgType->getPrimitiveSizeInBits();
 909     unsigned ArgBytes = ArgSizeInBits >> 3;
 910     EVT ArgVT;
 911     if (ArgSizeInBits < VT.getSizeInBits()) {
 912       assert(!ArgType->isFloatTy() &&
 913              "Extending floating point arguments not supported yet");
 914       ArgVT = MVT::getIntegerVT(ArgSizeInBits);
 915     } else {
 916       ArgVT = VT;
 917     }
 918     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 919                                                     AMDGPUAS::PARAM_I_ADDRESS);
 920     SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
 921                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
 922                                        MachinePointerInfo(new Argument(PtrTy)),
 923                                        ArgVT, false, false, ArgBytes);
 924     InVals.push_back(Arg);
 925     ParamOffsetBytes += ArgBytes;
 926   }
 927   return Chain;
 928 }
 929
 930 EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
 931    if (!VT.isVector()) return MVT::i32;
 932    return VT.changeVectorElementTypeToInteger();
 933 }
 934
 935 //===----------------------------------------------------------------------===//
 936 // Custom DAG Optimizations
 937 //===----------------------------------------------------------------------===//
 938
 939 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
 940                                               DAGCombinerInfo &DCI) const {
 941   SelectionDAG &DAG = DCI.DAG;
 942
 943   switch (N->getOpcode()) {
 944   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
 945   case ISD::FP_ROUND: {
 946       SDValue Arg = N->getOperand(0);
 947       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
 948         return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
 949                            Arg.getOperand(0));
 950       }
 951       break;
 952     }
 953   // Extract_vec (Build_vector) generated by custom lowering
 954   // also needs to be customly combined
 955   case ISD::EXTRACT_VECTOR_ELT: {
 956     SDValue Arg = N->getOperand(0);
 957     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
 958       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
 959         unsigned Element = Const->getZExtValue();
 960         return Arg->getOperand(Element);
 961       }
 962     }
 963     if (Arg.getOpcode() == ISD::BITCAST &&
 964         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
 965       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
 966         unsigned Element = Const->getZExtValue();
 967         return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
 968             Arg->getOperand(0).getOperand(Element));
 969       }
 970     }
 971   }
 972   }
 973   return SDValue();
 974 }