lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/MachineFrameInfo.h"
  20 #include "llvm/CodeGen/MachineInstrBuilder.h"
  21 #include "llvm/CodeGen/MachineRegisterInfo.h"
  22 #include "llvm/CodeGen/SelectionDAG.h"
  23 #include "llvm/IR/Argument.h"
  24 #include "llvm/IR/Function.h"
  25
  26 using namespace llvm;
  27
  28 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  29     AMDGPUTargetLowering(TM),
  30     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  31   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  32   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  33   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  34   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  35   computeRegisterProperties();
  36
  37   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  38   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  39   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  40   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  41
  42   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  43   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  44
  45   setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  46   setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  47   setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  48   setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  49   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  50
  51   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  52   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  53
  54   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  55
  56   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  57   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  58   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  59
  60   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  61   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  62
  63   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  64   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  65   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  66
  67   setOperationAction(ISD::SELECT, MVT::i32, Custom);
  68   setOperationAction(ISD::SELECT, MVT::f32, Custom);
  69
  70   // Legalize loads and stores to the private address space.
  71   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  72   setOperationAction(ISD::LOAD, MVT::v2i32, Expand);
  73   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  74   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
  75   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
  76   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Expand);
  77   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
  78   setOperationAction(ISD::STORE, MVT::i8, Custom);
  79   setOperationAction(ISD::STORE, MVT::i32, Custom);
  80   setOperationAction(ISD::STORE, MVT::v2i32, Expand);
  81   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  82
  83   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  84   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  85   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
  86
  87   setTargetDAGCombine(ISD::FP_ROUND);
  88   setTargetDAGCombine(ISD::FP_TO_SINT);
  89   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  90   setTargetDAGCombine(ISD::SELECT_CC);
  91
  92   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  93
  94   setBooleanContents(ZeroOrNegativeOneBooleanContent);
  95   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  96   setSchedulingPreference(Sched::VLIW);
  97 }
  98
  99 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 100     MachineInstr * MI, MachineBasicBlock * BB) const {
 101   MachineFunction * MF = BB->getParent();
 102   MachineRegisterInfo &MRI = MF->getRegInfo();
 103   MachineBasicBlock::iterator I = *MI;
 104   const R600InstrInfo *TII =
 105     static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
 106
 107   switch (MI->getOpcode()) {
 108   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 109   case AMDGPU::CLAMP_R600: {
 110     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 111                                                    AMDGPU::MOV,
 112                                                    MI->getOperand(0).getReg(),
 113                                                    MI->getOperand(1).getReg());
 114     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 115     break;
 116   }
 117
 118   case AMDGPU::FABS_R600: {
 119     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 120                                                     AMDGPU::MOV,
 121                                                     MI->getOperand(0).getReg(),
 122                                                     MI->getOperand(1).getReg());
 123     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 124     break;
 125   }
 126
 127   case AMDGPU::FNEG_R600: {
 128     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 129                                                     AMDGPU::MOV,
 130                                                     MI->getOperand(0).getReg(),
 131                                                     MI->getOperand(1).getReg());
 132     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 133     break;
 134   }
 135
 136   case AMDGPU::MASK_WRITE: {
 137     unsigned maskedRegister = MI->getOperand(0).getReg();
 138     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 139     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 140     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 141     break;
 142   }
 143
 144   case AMDGPU::LDS_READ_RET: {
 145     MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 146                                         TII->get(MI->getOpcode()),
 147                                         AMDGPU::OQAP);
 148     for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 149       NewMI.addOperand(MI->getOperand(i));
 150     }
 151     TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
 152                                  MI->getOperand(0).getReg(),
 153                                  AMDGPU::OQAP);
 154     break;
 155   }
 156
 157   case AMDGPU::MOV_IMM_F32:
 158     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 159                      MI->getOperand(1).getFPImm()->getValueAPF()
 160                          .bitcastToAPInt().getZExtValue());
 161     break;
 162   case AMDGPU::MOV_IMM_I32:
 163     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 164                      MI->getOperand(1).getImm());
 165     break;
 166   case AMDGPU::CONST_COPY: {
 167     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 168         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 169     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 170         MI->getOperand(1).getImm());
 171     break;
 172   }
 173
 174   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 175   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 176     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 177
 178     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 179             .addOperand(MI->getOperand(0))
 180             .addOperand(MI->getOperand(1))
 181             .addImm(EOP); // Set End of program bit
 182     break;
 183   }
 184
 185   case AMDGPU::TXD: {
 186     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 187     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 188     MachineOperand &RID = MI->getOperand(4);
 189     MachineOperand &SID = MI->getOperand(5);
 190     unsigned TextureId = MI->getOperand(6).getImm();
 191     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 192     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 193
 194     switch (TextureId) {
 195     case 5: // Rect
 196       CTX = CTY = 0;
 197       break;
 198     case 6: // Shadow1D
 199       SrcW = SrcZ;
 200       break;
 201     case 7: // Shadow2D
 202       SrcW = SrcZ;
 203       break;
 204     case 8: // ShadowRect
 205       CTX = CTY = 0;
 206       SrcW = SrcZ;
 207       break;
 208     case 9: // 1DArray
 209       SrcZ = SrcY;
 210       CTZ = 0;
 211       break;
 212     case 10: // 2DArray
 213       CTZ = 0;
 214       break;
 215     case 11: // Shadow1DArray
 216       SrcZ = SrcY;
 217       CTZ = 0;
 218       break;
 219     case 12: // Shadow2DArray
 220       CTZ = 0;
 221       break;
 222     }
 223     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 224             .addOperand(MI->getOperand(3))
 225             .addImm(SrcX)
 226             .addImm(SrcY)
 227             .addImm(SrcZ)
 228             .addImm(SrcW)
 229             .addImm(0)
 230             .addImm(0)
 231             .addImm(0)
 232             .addImm(0)
 233             .addImm(1)
 234             .addImm(2)
 235             .addImm(3)
 236             .addOperand(RID)
 237             .addOperand(SID)
 238             .addImm(CTX)
 239             .addImm(CTY)
 240             .addImm(CTZ)
 241             .addImm(CTW);
 242     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 243             .addOperand(MI->getOperand(2))
 244             .addImm(SrcX)
 245             .addImm(SrcY)
 246             .addImm(SrcZ)
 247             .addImm(SrcW)
 248             .addImm(0)
 249             .addImm(0)
 250             .addImm(0)
 251             .addImm(0)
 252             .addImm(1)
 253             .addImm(2)
 254             .addImm(3)
 255             .addOperand(RID)
 256             .addOperand(SID)
 257             .addImm(CTX)
 258             .addImm(CTY)
 259             .addImm(CTZ)
 260             .addImm(CTW);
 261     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 262             .addOperand(MI->getOperand(0))
 263             .addOperand(MI->getOperand(1))
 264             .addImm(SrcX)
 265             .addImm(SrcY)
 266             .addImm(SrcZ)
 267             .addImm(SrcW)
 268             .addImm(0)
 269             .addImm(0)
 270             .addImm(0)
 271             .addImm(0)
 272             .addImm(1)
 273             .addImm(2)
 274             .addImm(3)
 275             .addOperand(RID)
 276             .addOperand(SID)
 277             .addImm(CTX)
 278             .addImm(CTY)
 279             .addImm(CTZ)
 280             .addImm(CTW)
 281             .addReg(T0, RegState::Implicit)
 282             .addReg(T1, RegState::Implicit);
 283     break;
 284   }
 285
 286   case AMDGPU::TXD_SHADOW: {
 287     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 288     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 289     MachineOperand &RID = MI->getOperand(4);
 290     MachineOperand &SID = MI->getOperand(5);
 291     unsigned TextureId = MI->getOperand(6).getImm();
 292     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 293     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 294
 295     switch (TextureId) {
 296     case 5: // Rect
 297       CTX = CTY = 0;
 298       break;
 299     case 6: // Shadow1D
 300       SrcW = SrcZ;
 301       break;
 302     case 7: // Shadow2D
 303       SrcW = SrcZ;
 304       break;
 305     case 8: // ShadowRect
 306       CTX = CTY = 0;
 307       SrcW = SrcZ;
 308       break;
 309     case 9: // 1DArray
 310       SrcZ = SrcY;
 311       CTZ = 0;
 312       break;
 313     case 10: // 2DArray
 314       CTZ = 0;
 315       break;
 316     case 11: // Shadow1DArray
 317       SrcZ = SrcY;
 318       CTZ = 0;
 319       break;
 320     case 12: // Shadow2DArray
 321       CTZ = 0;
 322       break;
 323     }
 324
 325     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 326             .addOperand(MI->getOperand(3))
 327             .addImm(SrcX)
 328             .addImm(SrcY)
 329             .addImm(SrcZ)
 330             .addImm(SrcW)
 331             .addImm(0)
 332             .addImm(0)
 333             .addImm(0)
 334             .addImm(0)
 335             .addImm(1)
 336             .addImm(2)
 337             .addImm(3)
 338             .addOperand(RID)
 339             .addOperand(SID)
 340             .addImm(CTX)
 341             .addImm(CTY)
 342             .addImm(CTZ)
 343             .addImm(CTW);
 344     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 345             .addOperand(MI->getOperand(2))
 346             .addImm(SrcX)
 347             .addImm(SrcY)
 348             .addImm(SrcZ)
 349             .addImm(SrcW)
 350             .addImm(0)
 351             .addImm(0)
 352             .addImm(0)
 353             .addImm(0)
 354             .addImm(1)
 355             .addImm(2)
 356             .addImm(3)
 357             .addOperand(RID)
 358             .addOperand(SID)
 359             .addImm(CTX)
 360             .addImm(CTY)
 361             .addImm(CTZ)
 362             .addImm(CTW);
 363     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 364             .addOperand(MI->getOperand(0))
 365             .addOperand(MI->getOperand(1))
 366             .addImm(SrcX)
 367             .addImm(SrcY)
 368             .addImm(SrcZ)
 369             .addImm(SrcW)
 370             .addImm(0)
 371             .addImm(0)
 372             .addImm(0)
 373             .addImm(0)
 374             .addImm(1)
 375             .addImm(2)
 376             .addImm(3)
 377             .addOperand(RID)
 378             .addOperand(SID)
 379             .addImm(CTX)
 380             .addImm(CTY)
 381             .addImm(CTZ)
 382             .addImm(CTW)
 383             .addReg(T0, RegState::Implicit)
 384             .addReg(T1, RegState::Implicit);
 385     break;
 386   }
 387
 388   case AMDGPU::BRANCH:
 389       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 390               .addOperand(MI->getOperand(0));
 391       break;
 392
 393   case AMDGPU::BRANCH_COND_f32: {
 394     MachineInstr *NewMI =
 395       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 396               AMDGPU::PREDICATE_BIT)
 397               .addOperand(MI->getOperand(1))
 398               .addImm(OPCODE_IS_NOT_ZERO)
 399               .addImm(0); // Flags
 400     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 401     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 402             .addOperand(MI->getOperand(0))
 403             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 404     break;
 405   }
 406
 407   case AMDGPU::BRANCH_COND_i32: {
 408     MachineInstr *NewMI =
 409       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 410             AMDGPU::PREDICATE_BIT)
 411             .addOperand(MI->getOperand(1))
 412             .addImm(OPCODE_IS_NOT_ZERO_INT)
 413             .addImm(0); // Flags
 414     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 415     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 416            .addOperand(MI->getOperand(0))
 417             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 418     break;
 419   }
 420
 421   case AMDGPU::EG_ExportSwz:
 422   case AMDGPU::R600_ExportSwz: {
 423     // Instruction is left unmodified if its not the last one of its type
 424     bool isLastInstructionOfItsType = true;
 425     unsigned InstExportType = MI->getOperand(1).getImm();
 426     for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
 427          EndBlock = BB->end(); NextExportInst != EndBlock;
 428          NextExportInst = llvm::next(NextExportInst)) {
 429       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 430           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 431         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 432             .getImm();
 433         if (CurrentInstExportType == InstExportType) {
 434           isLastInstructionOfItsType = false;
 435           break;
 436         }
 437       }
 438     }
 439     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 440     if (!EOP && !isLastInstructionOfItsType)
 441       return BB;
 442     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 443     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 444             .addOperand(MI->getOperand(0))
 445             .addOperand(MI->getOperand(1))
 446             .addOperand(MI->getOperand(2))
 447             .addOperand(MI->getOperand(3))
 448             .addOperand(MI->getOperand(4))
 449             .addOperand(MI->getOperand(5))
 450             .addOperand(MI->getOperand(6))
 451             .addImm(CfInst)
 452             .addImm(EOP);
 453     break;
 454   }
 455   case AMDGPU::RETURN: {
 456     // RETURN instructions must have the live-out registers as implicit uses,
 457     // otherwise they appear dead.
 458     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 459     MachineInstrBuilder MIB(*MF, MI);
 460     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 461       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 462     return BB;
 463   }
 464   }
 465
 466   MI->eraseFromParent();
 467   return BB;
 468 }
 469
 470 //===----------------------------------------------------------------------===//
 471 // Custom DAG Lowering Operations
 472 //===----------------------------------------------------------------------===//
 473
 474 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 475   MachineFunction &MF = DAG.getMachineFunction();
 476   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 477   switch (Op.getOpcode()) {
 478   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 479   case ISD::FCOS:
 480   case ISD::FSIN: return LowerTrig(Op, DAG);
 481   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 482   case ISD::SELECT: return LowerSELECT(Op, DAG);
 483   case ISD::STORE: return LowerSTORE(Op, DAG);
 484   case ISD::LOAD: return LowerLOAD(Op, DAG);
 485   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
 486   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 487   case ISD::INTRINSIC_VOID: {
 488     SDValue Chain = Op.getOperand(0);
 489     unsigned IntrinsicID =
 490                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 491     switch (IntrinsicID) {
 492     case AMDGPUIntrinsic::AMDGPU_store_output: {
 493       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 494       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 495       MFI->LiveOuts.push_back(Reg);
 496       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 497     }
 498     case AMDGPUIntrinsic::R600_store_swizzle: {
 499       const SDValue Args[8] = {
 500         Chain,
 501         Op.getOperand(2), // Export Value
 502         Op.getOperand(3), // ArrayBase
 503         Op.getOperand(4), // Type
 504         DAG.getConstant(0, MVT::i32), // SWZ_X
 505         DAG.getConstant(1, MVT::i32), // SWZ_Y
 506         DAG.getConstant(2, MVT::i32), // SWZ_Z
 507         DAG.getConstant(3, MVT::i32) // SWZ_W
 508       };
 509       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
 510           Args, 8);
 511     }
 512
 513     // default for switch(IntrinsicID)
 514     default: break;
 515     }
 516     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 517     break;
 518   }
 519   case ISD::INTRINSIC_WO_CHAIN: {
 520     unsigned IntrinsicID =
 521                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 522     EVT VT = Op.getValueType();
 523     SDLoc DL(Op);
 524     switch(IntrinsicID) {
 525     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 526     case AMDGPUIntrinsic::R600_load_input: {
 527       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 528       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 529       MachineFunction &MF = DAG.getMachineFunction();
 530       MachineRegisterInfo &MRI = MF.getRegInfo();
 531       MRI.addLiveIn(Reg);
 532       return DAG.getCopyFromReg(DAG.getEntryNode(),
 533           SDLoc(DAG.getEntryNode()), Reg, VT);
 534     }
 535
 536     case AMDGPUIntrinsic::R600_interp_input: {
 537       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 538       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 539       MachineSDNode *interp;
 540       if (ijb < 0) {
 541         const MachineFunction &MF = DAG.getMachineFunction();
 542         const R600InstrInfo *TII =
 543           static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
 544         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 545             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 546         return DAG.getTargetExtractSubreg(
 547             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 548             DL, MVT::f32, SDValue(interp, 0));
 549       }
 550
 551       MachineFunction &MF = DAG.getMachineFunction();
 552       MachineRegisterInfo &MRI = MF.getRegInfo();
 553       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 554       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 555       MRI.addLiveIn(RegisterI);
 556       MRI.addLiveIn(RegisterJ);
 557       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 558           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 559       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 560           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 561
 562       if (slot % 4 < 2)
 563         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 564             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 565             RegisterJNode, RegisterINode);
 566       else
 567         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 568             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 569             RegisterJNode, RegisterINode);
 570       return SDValue(interp, slot % 2);
 571     }
 572     case AMDGPUIntrinsic::R600_tex:
 573     case AMDGPUIntrinsic::R600_texc:
 574     case AMDGPUIntrinsic::R600_txl:
 575     case AMDGPUIntrinsic::R600_txlc:
 576     case AMDGPUIntrinsic::R600_txb:
 577     case AMDGPUIntrinsic::R600_txbc:
 578     case AMDGPUIntrinsic::R600_txf:
 579     case AMDGPUIntrinsic::R600_txq:
 580     case AMDGPUIntrinsic::R600_ddx:
 581     case AMDGPUIntrinsic::R600_ddy: {
 582       unsigned TextureOp;
 583       switch (IntrinsicID) {
 584       case AMDGPUIntrinsic::R600_tex:
 585         TextureOp = 0;
 586         break;
 587       case AMDGPUIntrinsic::R600_texc:
 588         TextureOp = 1;
 589         break;
 590       case AMDGPUIntrinsic::R600_txl:
 591         TextureOp = 2;
 592         break;
 593       case AMDGPUIntrinsic::R600_txlc:
 594         TextureOp = 3;
 595         break;
 596       case AMDGPUIntrinsic::R600_txb:
 597         TextureOp = 4;
 598         break;
 599       case AMDGPUIntrinsic::R600_txbc:
 600         TextureOp = 5;
 601         break;
 602       case AMDGPUIntrinsic::R600_txf:
 603         TextureOp = 6;
 604         break;
 605       case AMDGPUIntrinsic::R600_txq:
 606         TextureOp = 7;
 607         break;
 608       case AMDGPUIntrinsic::R600_ddx:
 609         TextureOp = 8;
 610         break;
 611       case AMDGPUIntrinsic::R600_ddy:
 612         TextureOp = 9;
 613         break;
 614       default:
 615         llvm_unreachable("Unknow Texture Operation");
 616       }
 617
 618       SDValue TexArgs[19] = {
 619         DAG.getConstant(TextureOp, MVT::i32),
 620         Op.getOperand(1),
 621         DAG.getConstant(0, MVT::i32),
 622         DAG.getConstant(1, MVT::i32),
 623         DAG.getConstant(2, MVT::i32),
 624         DAG.getConstant(3, MVT::i32),
 625         Op.getOperand(2),
 626         Op.getOperand(3),
 627         Op.getOperand(4),
 628         DAG.getConstant(0, MVT::i32),
 629         DAG.getConstant(1, MVT::i32),
 630         DAG.getConstant(2, MVT::i32),
 631         DAG.getConstant(3, MVT::i32),
 632         Op.getOperand(5),
 633         Op.getOperand(6),
 634         Op.getOperand(7),
 635         Op.getOperand(8),
 636         Op.getOperand(9),
 637         Op.getOperand(10)
 638       };
 639       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
 640     }
 641     case AMDGPUIntrinsic::AMDGPU_dp4: {
 642       SDValue Args[8] = {
 643       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 644           DAG.getConstant(0, MVT::i32)),
 645       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 646           DAG.getConstant(0, MVT::i32)),
 647       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 648           DAG.getConstant(1, MVT::i32)),
 649       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 650           DAG.getConstant(1, MVT::i32)),
 651       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 652           DAG.getConstant(2, MVT::i32)),
 653       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 654           DAG.getConstant(2, MVT::i32)),
 655       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 656           DAG.getConstant(3, MVT::i32)),
 657       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 658           DAG.getConstant(3, MVT::i32))
 659       };
 660       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
 661     }
 662
 663     case Intrinsic::r600_read_ngroups_x:
 664       return LowerImplicitParameter(DAG, VT, DL, 0);
 665     case Intrinsic::r600_read_ngroups_y:
 666       return LowerImplicitParameter(DAG, VT, DL, 1);
 667     case Intrinsic::r600_read_ngroups_z:
 668       return LowerImplicitParameter(DAG, VT, DL, 2);
 669     case Intrinsic::r600_read_global_size_x:
 670       return LowerImplicitParameter(DAG, VT, DL, 3);
 671     case Intrinsic::r600_read_global_size_y:
 672       return LowerImplicitParameter(DAG, VT, DL, 4);
 673     case Intrinsic::r600_read_global_size_z:
 674       return LowerImplicitParameter(DAG, VT, DL, 5);
 675     case Intrinsic::r600_read_local_size_x:
 676       return LowerImplicitParameter(DAG, VT, DL, 6);
 677     case Intrinsic::r600_read_local_size_y:
 678       return LowerImplicitParameter(DAG, VT, DL, 7);
 679     case Intrinsic::r600_read_local_size_z:
 680       return LowerImplicitParameter(DAG, VT, DL, 8);
 681
 682     case Intrinsic::r600_read_tgid_x:
 683       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 684                                   AMDGPU::T1_X, VT);
 685     case Intrinsic::r600_read_tgid_y:
 686       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 687                                   AMDGPU::T1_Y, VT);
 688     case Intrinsic::r600_read_tgid_z:
 689       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 690                                   AMDGPU::T1_Z, VT);
 691     case Intrinsic::r600_read_tidig_x:
 692       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 693                                   AMDGPU::T0_X, VT);
 694     case Intrinsic::r600_read_tidig_y:
 695       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 696                                   AMDGPU::T0_Y, VT);
 697     case Intrinsic::r600_read_tidig_z:
 698       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 699                                   AMDGPU::T0_Z, VT);
 700     }
 701     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 702     break;
 703   }
 704   } // end switch(Op.getOpcode())
 705   return SDValue();
 706 }
 707
 708 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 709                                             SmallVectorImpl<SDValue> &Results,
 710                                             SelectionDAG &DAG) const {
 711   switch (N->getOpcode()) {
 712   default: return;
 713   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 714     return;
 715   case ISD::LOAD: {
 716     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 717     Results.push_back(SDValue(Node, 0));
 718     Results.push_back(SDValue(Node, 1));
 719     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 720     // function
 721     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 722     return;
 723   }
 724   case ISD::STORE:
 725     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 726     Results.push_back(SDValue(Node, 0));
 727     return;
 728   }
 729 }
 730
 731 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 732   // On hw >= R700, COS/SIN input must be between -1. and 1.
 733   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 734   EVT VT = Op.getValueType();
 735   SDValue Arg = Op.getOperand(0);
 736   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 737       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 738         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 739           DAG.getConstantFP(0.15915494309, MVT::f32)),
 740         DAG.getConstantFP(0.5, MVT::f32)));
 741   unsigned TrigNode;
 742   switch (Op.getOpcode()) {
 743   case ISD::FCOS:
 744     TrigNode = AMDGPUISD::COS_HW;
 745     break;
 746   case ISD::FSIN:
 747     TrigNode = AMDGPUISD::SIN_HW;
 748     break;
 749   default:
 750     llvm_unreachable("Wrong trig opcode");
 751   }
 752   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
 753       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
 754         DAG.getConstantFP(-0.5, MVT::f32)));
 755   if (Gen >= AMDGPUSubtarget::R700)
 756     return TrigVal;
 757   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 758   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
 759       DAG.getConstantFP(3.14159265359, MVT::f32));
 760 }
 761
 762 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 763   return DAG.getNode(
 764       ISD::SETCC,
 765       SDLoc(Op),
 766       MVT::i1,
 767       Op, DAG.getConstantFP(0.0f, MVT::f32),
 768       DAG.getCondCode(ISD::SETNE)
 769       );
 770 }
 771
 772 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 773                                                    SDLoc DL,
 774                                                    unsigned DwordOffset) const {
 775   unsigned ByteOffset = DwordOffset * 4;
 776   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 777                                       AMDGPUAS::PARAM_I_ADDRESS);
 778
 779   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 780   assert(isInt<16>(ByteOffset));
 781
 782   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 783                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 784                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 785                      false, false, false, 0);
 786 }
 787
 788 SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
 789
 790   MachineFunction &MF = DAG.getMachineFunction();
 791   const AMDGPUFrameLowering *TFL =
 792    static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
 793
 794   FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
 795   assert(FIN);
 796
 797   unsigned FrameIndex = FIN->getIndex();
 798   unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
 799   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
 800 }
 801
 802 bool R600TargetLowering::isZero(SDValue Op) const {
 803   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 804     return Cst->isNullValue();
 805   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 806     return CstFP->isZero();
 807   } else {
 808     return false;
 809   }
 810 }
 811
 812 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 813   SDLoc DL(Op);
 814   EVT VT = Op.getValueType();
 815
 816   SDValue LHS = Op.getOperand(0);
 817   SDValue RHS = Op.getOperand(1);
 818   SDValue True = Op.getOperand(2);
 819   SDValue False = Op.getOperand(3);
 820   SDValue CC = Op.getOperand(4);
 821   SDValue Temp;
 822
 823   // LHS and RHS are guaranteed to be the same value type
 824   EVT CompareVT = LHS.getValueType();
 825
 826   // Check if we can lower this to a native operation.
 827
 828   // Try to lower to a SET* instruction:
 829   //
 830   // SET* can match the following patterns:
 831   //
 832   // select_cc f32, f32, -1,  0, cc_any
 833   // select_cc f32, f32, 1.0f, 0.0f, cc_any
 834   // select_cc i32, i32, -1,  0, cc_any
 835   //
 836
 837   // Move hardware True/False values to the correct operand.
 838   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 839     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 840     std::swap(False, True);
 841     CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
 842   }
 843
 844   if (isHWTrueValue(True) && isHWFalseValue(False) &&
 845       (CompareVT == VT || VT == MVT::i32)) {
 846     // This can be matched by a SET* instruction.
 847     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 848   }
 849
 850   // Try to lower to a CND* instruction:
 851   //
 852   // CND* can match the following patterns:
 853   //
 854   // select_cc f32, 0.0, f32, f32, cc_any
 855   // select_cc f32, 0.0, i32, i32, cc_any
 856   // select_cc i32, 0,   f32, f32, cc_any
 857   // select_cc i32, 0,   i32, i32, cc_any
 858   //
 859   if (isZero(LHS) || isZero(RHS)) {
 860     SDValue Cond = (isZero(LHS) ? RHS : LHS);
 861     SDValue Zero = (isZero(LHS) ? LHS : RHS);
 862     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 863     if (CompareVT != VT) {
 864       // Bitcast True / False to the correct types.  This will end up being
 865       // a nop, but it allows us to define only a single pattern in the
 866       // .TD files for each CND* instruction rather than having to have
 867       // one pattern for integer True/False and one for fp True/False
 868       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 869       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 870     }
 871     if (isZero(LHS)) {
 872       CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
 873     }
 874
 875     switch (CCOpcode) {
 876     case ISD::SETONE:
 877     case ISD::SETUNE:
 878     case ISD::SETNE:
 879     case ISD::SETULE:
 880     case ISD::SETULT:
 881     case ISD::SETOLE:
 882     case ISD::SETOLT:
 883     case ISD::SETLE:
 884     case ISD::SETLT:
 885       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 886       Temp = True;
 887       True = False;
 888       False = Temp;
 889       break;
 890     default:
 891       break;
 892     }
 893     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 894         Cond, Zero,
 895         True, False,
 896         DAG.getCondCode(CCOpcode));
 897     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 898   }
 899
 900
 901   // Possible Min/Max pattern
 902   SDValue MinMax = LowerMinMax(Op, DAG);
 903   if (MinMax.getNode()) {
 904     return MinMax;
 905   }
 906
 907   // If we make it this for it means we have no native instructions to handle
 908   // this SELECT_CC, so we must lower it.
 909   SDValue HWTrue, HWFalse;
 910
 911   if (CompareVT == MVT::f32) {
 912     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 913     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 914   } else if (CompareVT == MVT::i32) {
 915     HWTrue = DAG.getConstant(-1, CompareVT);
 916     HWFalse = DAG.getConstant(0, CompareVT);
 917   }
 918   else {
 919     assert(!"Unhandled value type in LowerSELECT_CC");
 920   }
 921
 922   // Lower this unsupported SELECT_CC into a combination of two supported
 923   // SELECT_CC operations.
 924   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 925
 926   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 927       Cond, HWFalse,
 928       True, False,
 929       DAG.getCondCode(ISD::SETNE));
 930 }
 931
 932 SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 933   return DAG.getNode(ISD::SELECT_CC,
 934       SDLoc(Op),
 935       Op.getValueType(),
 936       Op.getOperand(0),
 937       DAG.getConstant(0, MVT::i32),
 938       Op.getOperand(1),
 939       Op.getOperand(2),
 940       DAG.getCondCode(ISD::SETNE));
 941 }
 942
 943 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 944 /// convert these pointers to a register index.  Each register holds
 945 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
 946 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
 947 /// for indirect addressing.
 948 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
 949                                                unsigned StackWidth,
 950                                                SelectionDAG &DAG) const {
 951   unsigned SRLPad;
 952   switch(StackWidth) {
 953   case 1:
 954     SRLPad = 2;
 955     break;
 956   case 2:
 957     SRLPad = 3;
 958     break;
 959   case 4:
 960     SRLPad = 4;
 961     break;
 962   default: llvm_unreachable("Invalid stack width");
 963   }
 964
 965   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
 966                      DAG.getConstant(SRLPad, MVT::i32));
 967 }
 968
 969 void R600TargetLowering::getStackAddress(unsigned StackWidth,
 970                                          unsigned ElemIdx,
 971                                          unsigned &Channel,
 972                                          unsigned &PtrIncr) const {
 973   switch (StackWidth) {
 974   default:
 975   case 1:
 976     Channel = 0;
 977     if (ElemIdx > 0) {
 978       PtrIncr = 1;
 979     } else {
 980       PtrIncr = 0;
 981     }
 982     break;
 983   case 2:
 984     Channel = ElemIdx % 2;
 985     if (ElemIdx == 2) {
 986       PtrIncr = 1;
 987     } else {
 988       PtrIncr = 0;
 989     }
 990     break;
 991   case 4:
 992     Channel = ElemIdx;
 993     PtrIncr = 0;
 994     break;
 995   }
 996 }
 997
 998 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 999   SDLoc DL(Op);
1000   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1001   SDValue Chain = Op.getOperand(0);
1002   SDValue Value = Op.getOperand(1);
1003   SDValue Ptr = Op.getOperand(2);
1004
1005   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
1006       Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
1007     // Convert pointer from byte address to dword address.
1008     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1009                       DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1010                                   Ptr, DAG.getConstant(2, MVT::i32)));
1011
1012     if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1013       assert(!"Truncated and indexed stores not supported yet");
1014     } else {
1015       Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1016     }
1017     return Chain;
1018   }
1019
1020   EVT ValueVT = Value.getValueType();
1021
1022   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1023     return SDValue();
1024   }
1025
1026   // Lowering for indirect addressing
1027
1028   const MachineFunction &MF = DAG.getMachineFunction();
1029   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1030                                          getTargetMachine().getFrameLowering());
1031   unsigned StackWidth = TFL->getStackWidth(MF);
1032
1033   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1034
1035   if (ValueVT.isVector()) {
1036     unsigned NumElemVT = ValueVT.getVectorNumElements();
1037     EVT ElemVT = ValueVT.getVectorElementType();
1038     SDValue Stores[4];
1039
1040     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1041                                       "vector width in load");
1042
1043     for (unsigned i = 0; i < NumElemVT; ++i) {
1044       unsigned Channel, PtrIncr;
1045       getStackAddress(StackWidth, i, Channel, PtrIncr);
1046       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1047                         DAG.getConstant(PtrIncr, MVT::i32));
1048       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1049                                  Value, DAG.getConstant(i, MVT::i32));
1050
1051       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1052                               Chain, Elem, Ptr,
1053                               DAG.getTargetConstant(Channel, MVT::i32));
1054     }
1055      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1056    } else {
1057     if (ValueVT == MVT::i8) {
1058       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1059     }
1060     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1061     DAG.getTargetConstant(0, MVT::i32)); // Channel
1062   }
1063
1064   return Chain;
1065 }
1066
1067 // return (512 + (kc_bank << 12)
1068 static int
1069 ConstantAddressBlock(unsigned AddressSpace) {
1070   switch (AddressSpace) {
1071   case AMDGPUAS::CONSTANT_BUFFER_0:
1072     return 512;
1073   case AMDGPUAS::CONSTANT_BUFFER_1:
1074     return 512 + 4096;
1075   case AMDGPUAS::CONSTANT_BUFFER_2:
1076     return 512 + 4096 * 2;
1077   case AMDGPUAS::CONSTANT_BUFFER_3:
1078     return 512 + 4096 * 3;
1079   case AMDGPUAS::CONSTANT_BUFFER_4:
1080     return 512 + 4096 * 4;
1081   case AMDGPUAS::CONSTANT_BUFFER_5:
1082     return 512 + 4096 * 5;
1083   case AMDGPUAS::CONSTANT_BUFFER_6:
1084     return 512 + 4096 * 6;
1085   case AMDGPUAS::CONSTANT_BUFFER_7:
1086     return 512 + 4096 * 7;
1087   case AMDGPUAS::CONSTANT_BUFFER_8:
1088     return 512 + 4096 * 8;
1089   case AMDGPUAS::CONSTANT_BUFFER_9:
1090     return 512 + 4096 * 9;
1091   case AMDGPUAS::CONSTANT_BUFFER_10:
1092     return 512 + 4096 * 10;
1093   case AMDGPUAS::CONSTANT_BUFFER_11:
1094     return 512 + 4096 * 11;
1095   case AMDGPUAS::CONSTANT_BUFFER_12:
1096     return 512 + 4096 * 12;
1097   case AMDGPUAS::CONSTANT_BUFFER_13:
1098     return 512 + 4096 * 13;
1099   case AMDGPUAS::CONSTANT_BUFFER_14:
1100     return 512 + 4096 * 14;
1101   case AMDGPUAS::CONSTANT_BUFFER_15:
1102     return 512 + 4096 * 15;
1103   default:
1104     return -1;
1105   }
1106 }
1107
1108 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1109 {
1110   EVT VT = Op.getValueType();
1111   SDLoc DL(Op);
1112   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1113   SDValue Chain = Op.getOperand(0);
1114   SDValue Ptr = Op.getOperand(1);
1115   SDValue LoweredLoad;
1116
1117   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1118   if (ConstantBlock > -1) {
1119     SDValue Result;
1120     if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
1121         dyn_cast<Constant>(LoadNode->getSrcValue()) ||
1122         dyn_cast<ConstantSDNode>(Ptr)) {
1123       SDValue Slots[4];
1124       for (unsigned i = 0; i < 4; i++) {
1125         // We want Const position encoded with the following formula :
1126         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1127         // const_index is Ptr computed by llvm using an alignment of 16.
1128         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1129         // then div by 4 at the ISel step
1130         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1131             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1132         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1133       }
1134       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
1135     } else {
1136       // non constant ptr cant be folded, keeps it as a v4f32 load
1137       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1138           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1139           DAG.getConstant(LoadNode->getAddressSpace() -
1140                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1141           );
1142     }
1143
1144     if (!VT.isVector()) {
1145       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1146           DAG.getConstant(0, MVT::i32));
1147     }
1148
1149     SDValue MergedValues[2] = {
1150         Result,
1151         Chain
1152     };
1153     return DAG.getMergeValues(MergedValues, 2, DL);
1154   }
1155
1156   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1157     return SDValue();
1158   }
1159
1160   // Lowering for indirect addressing
1161   const MachineFunction &MF = DAG.getMachineFunction();
1162   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1163                                          getTargetMachine().getFrameLowering());
1164   unsigned StackWidth = TFL->getStackWidth(MF);
1165
1166   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1167
1168   if (VT.isVector()) {
1169     unsigned NumElemVT = VT.getVectorNumElements();
1170     EVT ElemVT = VT.getVectorElementType();
1171     SDValue Loads[4];
1172
1173     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1174                                       "vector width in load");
1175
1176     for (unsigned i = 0; i < NumElemVT; ++i) {
1177       unsigned Channel, PtrIncr;
1178       getStackAddress(StackWidth, i, Channel, PtrIncr);
1179       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1180                         DAG.getConstant(PtrIncr, MVT::i32));
1181       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1182                              Chain, Ptr,
1183                              DAG.getTargetConstant(Channel, MVT::i32),
1184                              Op.getOperand(2));
1185     }
1186     for (unsigned i = NumElemVT; i < 4; ++i) {
1187       Loads[i] = DAG.getUNDEF(ElemVT);
1188     }
1189     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1190     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1191   } else {
1192     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1193                               Chain, Ptr,
1194                               DAG.getTargetConstant(0, MVT::i32), // Channel
1195                               Op.getOperand(2));
1196   }
1197
1198   SDValue Ops[2];
1199   Ops[0] = LoweredLoad;
1200   Ops[1] = Chain;
1201
1202   return DAG.getMergeValues(Ops, 2, DL);
1203 }
1204
1205 /// XXX Only kernel functions are supported, so we can assume for now that
1206 /// every function is a kernel function, but in the future we should use
1207 /// separate calling conventions for kernel and non-kernel functions.
1208 SDValue R600TargetLowering::LowerFormalArguments(
1209                                       SDValue Chain,
1210                                       CallingConv::ID CallConv,
1211                                       bool isVarArg,
1212                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1213                                       SDLoc DL, SelectionDAG &DAG,
1214                                       SmallVectorImpl<SDValue> &InVals) const {
1215   unsigned ParamOffsetBytes = 36;
1216   Function::const_arg_iterator FuncArg =
1217                             DAG.getMachineFunction().getFunction()->arg_begin();
1218   for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
1219     EVT VT = Ins[i].VT;
1220     Type *ArgType = FuncArg->getType();
1221     unsigned ArgSizeInBits = ArgType->isPointerTy() ?
1222                              32 : ArgType->getPrimitiveSizeInBits();
1223     unsigned ArgBytes = ArgSizeInBits >> 3;
1224     EVT ArgVT;
1225     if (ArgSizeInBits < VT.getSizeInBits()) {
1226       assert(!ArgType->isFloatTy() &&
1227              "Extending floating point arguments not supported yet");
1228       ArgVT = MVT::getIntegerVT(ArgSizeInBits);
1229     } else {
1230       ArgVT = VT;
1231     }
1232
1233     ISD::LoadExtType LoadType = ISD::EXTLOAD;
1234     if (Ins[i].Flags.isZExt()) {
1235       LoadType = ISD::ZEXTLOAD;
1236     } else if (Ins[i].Flags.isSExt()) {
1237       LoadType = ISD::SEXTLOAD;
1238     }
1239
1240     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1241                                                     AMDGPUAS::PARAM_I_ADDRESS);
1242     SDValue Arg = DAG.getExtLoad(LoadType, DL, VT, DAG.getRoot(),
1243                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
1244                                        MachinePointerInfo(UndefValue::get(PtrTy)),
1245                                        ArgVT, false, false, ArgBytes);
1246     InVals.push_back(Arg);
1247     ParamOffsetBytes += ArgBytes;
1248   }
1249   return Chain;
1250 }
1251
1252 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1253    if (!VT.isVector()) return MVT::i32;
1254    return VT.changeVectorElementTypeToInteger();
1255 }
1256
1257 static SDValue
1258 CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
1259                         DenseMap<unsigned, unsigned> &RemapSwizzle) {
1260   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1261   assert(RemapSwizzle.empty());
1262   SDValue NewBldVec[4] = {
1263       VectorEntry.getOperand(0),
1264       VectorEntry.getOperand(1),
1265       VectorEntry.getOperand(2),
1266       VectorEntry.getOperand(3)
1267   };
1268
1269   for (unsigned i = 0; i < 4; i++) {
1270     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1271       if (C->isZero()) {
1272         RemapSwizzle[i] = 4; // SEL_0
1273         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1274       } else if (C->isExactlyValue(1.0)) {
1275         RemapSwizzle[i] = 5; // SEL_1
1276         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1277       }
1278     }
1279
1280     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1281       continue;
1282     for (unsigned j = 0; j < i; j++) {
1283       if (NewBldVec[i] == NewBldVec[j]) {
1284         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1285         RemapSwizzle[i] = j;
1286         break;
1287       }
1288     }
1289   }
1290
1291   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1292       VectorEntry.getValueType(), NewBldVec, 4);
1293 }
1294
1295 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1296                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1297   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1298   assert(RemapSwizzle.empty());
1299   SDValue NewBldVec[4] = {
1300       VectorEntry.getOperand(0),
1301       VectorEntry.getOperand(1),
1302       VectorEntry.getOperand(2),
1303       VectorEntry.getOperand(3)
1304   };
1305   bool isUnmovable[4] = { false, false, false, false };
1306   for (unsigned i = 0; i < 4; i++)
1307     RemapSwizzle[i] = i;
1308
1309   for (unsigned i = 0; i < 4; i++) {
1310     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1311       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1312           ->getZExtValue();
1313       if (!isUnmovable[Idx]) {
1314         // Swap i and Idx
1315         std::swap(NewBldVec[Idx], NewBldVec[i]);
1316         std::swap(RemapSwizzle[RemapSwizzle[Idx]], RemapSwizzle[RemapSwizzle[i]]);
1317       }
1318       isUnmovable[Idx] = true;
1319     }
1320   }
1321
1322   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1323       VectorEntry.getValueType(), NewBldVec, 4);
1324 }
1325
1326
1327 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1328 SDValue Swz[4], SelectionDAG &DAG) const {
1329   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1330   // Old -> New swizzle values
1331   DenseMap<unsigned, unsigned> SwizzleRemap;
1332
1333   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1334   for (unsigned i = 0; i < 4; i++) {
1335     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1336     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1337       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1338   }
1339
1340   SwizzleRemap.clear();
1341   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1342   for (unsigned i = 0; i < 4; i++) {
1343     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1344     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1345       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1346   }
1347
1348   return BuildVector;
1349 }
1350
1351
1352 //===----------------------------------------------------------------------===//
1353 // Custom DAG Optimizations
1354 //===----------------------------------------------------------------------===//
1355
1356 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1357                                               DAGCombinerInfo &DCI) const {
1358   SelectionDAG &DAG = DCI.DAG;
1359
1360   switch (N->getOpcode()) {
1361   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1362   case ISD::FP_ROUND: {
1363       SDValue Arg = N->getOperand(0);
1364       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1365         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1366                            Arg.getOperand(0));
1367       }
1368       break;
1369     }
1370
1371   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1372   // (i32 select_cc f32, f32, -1, 0 cc)
1373   //
1374   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1375   // this to one of the SET*_DX10 instructions.
1376   case ISD::FP_TO_SINT: {
1377     SDValue FNeg = N->getOperand(0);
1378     if (FNeg.getOpcode() != ISD::FNEG) {
1379       return SDValue();
1380     }
1381     SDValue SelectCC = FNeg.getOperand(0);
1382     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1383         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1384         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1385         !isHWTrueValue(SelectCC.getOperand(2)) ||
1386         !isHWFalseValue(SelectCC.getOperand(3))) {
1387       return SDValue();
1388     }
1389
1390     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1391                            SelectCC.getOperand(0), // LHS
1392                            SelectCC.getOperand(1), // RHS
1393                            DAG.getConstant(-1, MVT::i32), // True
1394                            DAG.getConstant(0, MVT::i32),  // Flase
1395                            SelectCC.getOperand(4)); // CC
1396
1397     break;
1398   }
1399   // Extract_vec (Build_vector) generated by custom lowering
1400   // also needs to be customly combined
1401   case ISD::EXTRACT_VECTOR_ELT: {
1402     SDValue Arg = N->getOperand(0);
1403     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1404       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1405         unsigned Element = Const->getZExtValue();
1406         return Arg->getOperand(Element);
1407       }
1408     }
1409     if (Arg.getOpcode() == ISD::BITCAST &&
1410         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1411       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1412         unsigned Element = Const->getZExtValue();
1413         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1414             Arg->getOperand(0).getOperand(Element));
1415       }
1416     }
1417   }
1418
1419   case ISD::SELECT_CC: {
1420     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1421     //      selectcc x, y, a, b, inv(cc)
1422     //
1423     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1424     //      selectcc x, y, a, b, cc
1425     SDValue LHS = N->getOperand(0);
1426     if (LHS.getOpcode() != ISD::SELECT_CC) {
1427       return SDValue();
1428     }
1429
1430     SDValue RHS = N->getOperand(1);
1431     SDValue True = N->getOperand(2);
1432     SDValue False = N->getOperand(3);
1433     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1434
1435     if (LHS.getOperand(2).getNode() != True.getNode() ||
1436         LHS.getOperand(3).getNode() != False.getNode() ||
1437         RHS.getNode() != False.getNode()) {
1438       return SDValue();
1439     }
1440
1441     switch (NCC) {
1442     default: return SDValue();
1443     case ISD::SETNE: return LHS;
1444     case ISD::SETEQ: {
1445       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1446       LHSCC = ISD::getSetCCInverse(LHSCC,
1447                                   LHS.getOperand(0).getValueType().isInteger());
1448       return DAG.getSelectCC(SDLoc(N),
1449                              LHS.getOperand(0),
1450                              LHS.getOperand(1),
1451                              LHS.getOperand(2),
1452                              LHS.getOperand(3),
1453                              LHSCC);
1454     }
1455     }
1456   }
1457   case AMDGPUISD::EXPORT: {
1458     SDValue Arg = N->getOperand(1);
1459     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1460       break;
1461
1462     SDValue NewArgs[8] = {
1463       N->getOperand(0), // Chain
1464       SDValue(),
1465       N->getOperand(2), // ArrayBase
1466       N->getOperand(3), // Type
1467       N->getOperand(4), // SWZ_X
1468       N->getOperand(5), // SWZ_Y
1469       N->getOperand(6), // SWZ_Z
1470       N->getOperand(7) // SWZ_W
1471     };
1472     SDLoc DL(N);
1473     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1474     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1475   }
1476   case AMDGPUISD::TEXTURE_FETCH: {
1477     SDValue Arg = N->getOperand(1);
1478     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1479       break;
1480
1481     SDValue NewArgs[19] = {
1482       N->getOperand(0),
1483       N->getOperand(1),
1484       N->getOperand(2),
1485       N->getOperand(3),
1486       N->getOperand(4),
1487       N->getOperand(5),
1488       N->getOperand(6),
1489       N->getOperand(7),
1490       N->getOperand(8),
1491       N->getOperand(9),
1492       N->getOperand(10),
1493       N->getOperand(11),
1494       N->getOperand(12),
1495       N->getOperand(13),
1496       N->getOperand(14),
1497       N->getOperand(15),
1498       N->getOperand(16),
1499       N->getOperand(17),
1500       N->getOperand(18),
1501     };
1502     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1503     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1504         NewArgs, 19);
1505   }
1506   }
1507   return SDValue();
1508 }