//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
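
  // Note: roughly speaking, the Expand action above hands SELECT to the
  // generic legalizer, which rewrites it as a SELECT_CC against zero
  // (e.g. (select c, a, b) -> (select_cc c, 0, a, b, setne)); that node is
  // then handled by LowerSELECT_CC below.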

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
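
  // The R600 integer compare instructions (the SET*_DX10 family used by the
  // combines below) write an all-ones mask (-1) for true and 0 for false,
  // which is why boolean contents are declared as 0/-1 here.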
  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::Source);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction. (OQAP is the LDS output-queue
    // register that LDS reads return through.)
    if (TII->isLDSInstr(MI->getOpcode()) &&
        TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst) != -1) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) {
        NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()),
                        AMDGPU::OQAP);
        TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
                                     MI->getOperand(0).getReg(),
                                     AMDGPU::OQAP);
      } else {
        NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                        TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      }
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
                       MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
            T0)
            .addOperand(MI->getOperand(3));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
            T1)
            .addOperand(MI->getOperand(2));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
            T0)
            .addOperand(MI->getOperand(3));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
            T1)
            .addOperand(MI->getOperand(2));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
                                SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
            static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:  TextureOp = 0; break;
      case AMDGPUIntrinsic::R600_texc: TextureOp = 1; break;
      case AMDGPUIntrinsic::R600_txl:  TextureOp = 2; break;
      case AMDGPUIntrinsic::R600_txlc: TextureOp = 3; break;
      case AMDGPUIntrinsic::R600_txb:  TextureOp = 4; break;
      case AMDGPUIntrinsic::R600_txbc: TextureOp = 5; break;
      case AMDGPUIntrinsic::R600_txf:  TextureOp = 6; break;
      case AMDGPUIntrinsic::R600_txq:  TextureOp = 7; break;
      case AMDGPUIntrinsic::R600_ddx:  TextureOp = 8; break;
      case AMDGPUIntrinsic::R600_ddy:  TextureOp = 9; break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(3, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
  }
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1. and 1.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
          DAG.getConstantFP(0.15915494309, MVT::f32)),
        DAG.getConstantFP(0.5, MVT::f32)));
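
  // 0.15915494309 is approximately 1/(2*Pi), so FractPart holds the argument
  // wrapped to one period, and the -0.5 applied below recenters it to
  // [-0.5, 0.5]. Worked example: x = Pi/2 gives x/(2*Pi) = 0.25, then
  // FRACT(0.25 + 0.5) - 0.5 = 0.25, i.e. a quarter turn.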
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
        DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}
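
// For illustration: with the implicit-parameter layout used in LowerOperation
// above (ngroups at dwords 0-2, global size at 3-5, local size at 6-8),
// r600_read_global_size_x becomes a load from byte offset 3 * 4 = 12 in
// CONSTANT_BUFFER_0.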

SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_any
  // select_cc f32, f32, 1.0f, 0.0f, cc_any
  // select_cc i32, i32, -1,  0, cc_any
  //

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_any
  // select_cc f32, 0.0, i32, i32, cc_any
  // select_cc i32, 0,   f32, f32, cc_any
  // select_cc i32, 0,   i32, i32, cc_any
  //
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
/// convert these pointers to a register index.  Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}
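
// For illustration: with StackWidth 4 all four channels of a 16-byte register
// are used, so byte address 48 maps to register index 48 >> 4 = 3; with
// StackWidth 1 only one 4-byte channel is used per entry, so the same address
// maps to index 48 >> 2 = 12.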

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    PtrIncr = (ElemIdx > 0) ? 1 : 0;
    break;
  case 2:
    Channel = ElemIdx % 2;
    PtrIncr = (ElemIdx == 2) ? 1 : 0;
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}
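
// For illustration (StackWidth == 2): element 0 maps to channel 0, element 1
// to channel 1, and element 2 bumps the register index by one before reusing
// channel 0. The PtrIncr values accumulate because the callers re-add them to
// Ptr on every loop iteration.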

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, 3, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        assert(!"Truncated and indexed stores not supported yet");
      }

      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value,
                        Ptr, DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// return (512 + (kc_bank << 12))
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}
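
// For illustration: 4096 == 1 << 12, so CONSTANT_BUFFER_2 yields
// 512 + 4096 * 2 == 512 + (2 << 12), matching the (512 + (kc_bank << 12))
// formula above.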

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      SplitVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
        isa<Constant>(LoadNode->getSrcValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the constant position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan) * 4) here and
        // then divide by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
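
      // For illustration: kc_bank 0, const_index 1 (Ptr == 16), chan 2 encodes
      // to ((512 + 0 + 1) << 2) + 2 == 2054 once ISel divides by 4.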
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
    } else {
      // A non-constant ptr can't be folded; keep it as a v4f32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so
  // we need to manually expand loads that may be legal in some address spaces
  // and illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported
  // for compute shaders, since the data is sign extended when it is uploaded
  // to the buffer. However SEXT loads from other address spaces are not
  // supported, so we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }
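
  // For illustration: an i8 sextload into i32 becomes an extload followed by
  // (sra (shl x, 24), 24), which copies bit 7 into the upper 24 bits.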

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  AnalyzeFormalArguments(CCInfo, Ins);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT VT = VA.getLocVT();

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    SDValue Arg = DAG.getLoad(VT, DL, Chain,
                        DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
                        false, false, 4); // 4 is the preferred alignment for
                                          // the CONSTANT memory space.
    InVals.push_back(Arg);
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   if (!VT.isVector()) return MVT::i32;
   return VT.changeVectorElementTypeToInteger();
}
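
// For illustration: a setcc of two v4f32 values yields v4i32 here, matching
// the 0/-1 ZeroOrNegativeOneBooleanContent masks the hardware produces.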

static SDValue
CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec, 4);
}
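
// For illustration: build_vector (a, 0.0, a, 1.0) is compacted to
// (a, undef, undef, undef) with RemapSwizzle = {1 -> SEL_0, 2 -> 0,
// 3 -> SEL_1}, so only one scalar value actually occupies a channel.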

static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++)
    RemapSwizzle[i] = i;

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (!isUnmovable[Idx]) {
        // Swap i and Idx
        std::swap(NewBldVec[Idx], NewBldVec[i]);
        std::swap(RemapSwizzle[RemapSwizzle[Idx]], RemapSwizzle[RemapSwizzle[i]]);
      }
      isUnmovable[Idx] = true;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec, 4);
}

SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC
  }
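
  // The SET*_DX10 variants write the integer mask -1/0 directly (instead of
  // the 1.0f/0.0f written by plain SET*), so the fneg+fp_to_sint pair above
  // folds away entirely.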

  // insert_vector_elt (build_vector elt0, ..., eltN), NewEltIdx, idx
  // => build_vector elt0, ..., NewEltIdx, ..., eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getNode(ISD::BUILD_VECTOR, dl,
                       VT, &Ops[0], Ops.size());
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }

  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      return DAG.getSelectCC(SDLoc(N),
                             LHS.getOperand(0),
                             LHS.getOperand(1),
                             LHS.getOperand(2),
                             LHS.getOperand(3),
                             LHSCC);
    }
    }
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18)
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
                       NewArgs, 19);
  }
  }
  return SDValue();
}

static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Src.isMachineOpcode())
    return false;
  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
      int OtherSrcIdx = SrcIndices[i];
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
              ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
      assert(C);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}

/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops;
  for (SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
       I != E; ++I)
    Ops.push_back(*I);

  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}
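
// For illustration: after selection, a MUL whose source was a FNEG_R600 node
// has FoldOperand() bypass the extra move, point the src operand at the
// original value, and set that operand's neg modifier bit to 1 instead.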