lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/CallingConvLower.h"
  20 #include "llvm/CodeGen/MachineFrameInfo.h"
  21 #include "llvm/CodeGen/MachineInstrBuilder.h"
  22 #include "llvm/CodeGen/MachineRegisterInfo.h"
  23 #include "llvm/CodeGen/SelectionDAG.h"
  24 #include "llvm/IR/Argument.h"
  25 #include "llvm/IR/Function.h"
  26
  27 using namespace llvm;
  28
  29 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  30     AMDGPUTargetLowering(TM),
  31     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  32   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  33   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  34   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  35   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  36   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  37   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  38
  39   computeRegisterProperties();
  40
  41   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  42   setOperationAction(ISD::FADD, MVT::v2f32, Expand);
  43   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  44   setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
  45   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  46   setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
  47   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  48   setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
  49
  50   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  51   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  52
  53   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  54   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  55
  56   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  57   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  58
  59   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  60
  61   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  62   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  63   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  64
  65   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  66   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  67
  68   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  69   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  70   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  71
  72   setOperationAction(ISD::SELECT, MVT::i32, Custom);
  73   setOperationAction(ISD::SELECT, MVT::f32, Custom);
  74
  75   // Legalize loads and stores to the private address space.
  76   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  77   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  78   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  79   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  80   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  81   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  82   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  83   setOperationAction(ISD::STORE, MVT::i8, Custom);
  84   setOperationAction(ISD::STORE, MVT::i32, Custom);
  85   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  86   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  87   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  88   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
  89
  90   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  91   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  92   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
  93
  94   setTargetDAGCombine(ISD::FP_ROUND);
  95   setTargetDAGCombine(ISD::FP_TO_SINT);
  96   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  97   setTargetDAGCombine(ISD::SELECT_CC);
  98   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
  99
 100   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 101
 102   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 103   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 104   setSchedulingPreference(Sched::Source);
 105 }
 106
 107 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 108     MachineInstr * MI, MachineBasicBlock * BB) const {
 109   MachineFunction * MF = BB->getParent();
 110   MachineRegisterInfo &MRI = MF->getRegInfo();
 111   MachineBasicBlock::iterator I = *MI;
 112   const R600InstrInfo *TII =
 113     static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
 114
 115   switch (MI->getOpcode()) {
 116   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 117   case AMDGPU::CLAMP_R600: {
 118     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 119                                                    AMDGPU::MOV,
 120                                                    MI->getOperand(0).getReg(),
 121                                                    MI->getOperand(1).getReg());
 122     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 123     break;
 124   }
 125
 126   case AMDGPU::FABS_R600: {
 127     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 128                                                     AMDGPU::MOV,
 129                                                     MI->getOperand(0).getReg(),
 130                                                     MI->getOperand(1).getReg());
 131     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 132     break;
 133   }
 134
 135   case AMDGPU::FNEG_R600: {
 136     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 137                                                     AMDGPU::MOV,
 138                                                     MI->getOperand(0).getReg(),
 139                                                     MI->getOperand(1).getReg());
 140     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 141     break;
 142   }
 143
 144   case AMDGPU::MASK_WRITE: {
 145     unsigned maskedRegister = MI->getOperand(0).getReg();
 146     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 147     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 148     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 149     break;
 150   }
 151
 152   case AMDGPU::LDS_READ_RET: {
 153     MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 154                                         TII->get(MI->getOpcode()),
 155                                         AMDGPU::OQAP);
 156     for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 157       NewMI.addOperand(MI->getOperand(i));
 158     }
 159     TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
 160                                  MI->getOperand(0).getReg(),
 161                                  AMDGPU::OQAP);
 162     break;
 163   }
 164
 165   case AMDGPU::MOV_IMM_F32:
 166     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 167                      MI->getOperand(1).getFPImm()->getValueAPF()
 168                          .bitcastToAPInt().getZExtValue());
 169     break;
 170   case AMDGPU::MOV_IMM_I32:
 171     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 172                      MI->getOperand(1).getImm());
 173     break;
 174   case AMDGPU::CONST_COPY: {
 175     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 176         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 177     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 178         MI->getOperand(1).getImm());
 179     break;
 180   }
 181
 182   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 183   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 184   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 185     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 186
 187     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 188             .addOperand(MI->getOperand(0))
 189             .addOperand(MI->getOperand(1))
 190             .addImm(EOP); // Set End of program bit
 191     break;
 192   }
 193
 194   case AMDGPU::TXD: {
 195     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 196     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 197     MachineOperand &RID = MI->getOperand(4);
 198     MachineOperand &SID = MI->getOperand(5);
 199     unsigned TextureId = MI->getOperand(6).getImm();
 200     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 201     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 202
 203     switch (TextureId) {
 204     case 5: // Rect
 205       CTX = CTY = 0;
 206       break;
 207     case 6: // Shadow1D
 208       SrcW = SrcZ;
 209       break;
 210     case 7: // Shadow2D
 211       SrcW = SrcZ;
 212       break;
 213     case 8: // ShadowRect
 214       CTX = CTY = 0;
 215       SrcW = SrcZ;
 216       break;
 217     case 9: // 1DArray
 218       SrcZ = SrcY;
 219       CTZ = 0;
 220       break;
 221     case 10: // 2DArray
 222       CTZ = 0;
 223       break;
 224     case 11: // Shadow1DArray
 225       SrcZ = SrcY;
 226       CTZ = 0;
 227       break;
 228     case 12: // Shadow2DArray
 229       CTZ = 0;
 230       break;
 231     }
 232     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 233             .addOperand(MI->getOperand(3))
 234             .addImm(SrcX)
 235             .addImm(SrcY)
 236             .addImm(SrcZ)
 237             .addImm(SrcW)
 238             .addImm(0)
 239             .addImm(0)
 240             .addImm(0)
 241             .addImm(0)
 242             .addImm(1)
 243             .addImm(2)
 244             .addImm(3)
 245             .addOperand(RID)
 246             .addOperand(SID)
 247             .addImm(CTX)
 248             .addImm(CTY)
 249             .addImm(CTZ)
 250             .addImm(CTW);
 251     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 252             .addOperand(MI->getOperand(2))
 253             .addImm(SrcX)
 254             .addImm(SrcY)
 255             .addImm(SrcZ)
 256             .addImm(SrcW)
 257             .addImm(0)
 258             .addImm(0)
 259             .addImm(0)
 260             .addImm(0)
 261             .addImm(1)
 262             .addImm(2)
 263             .addImm(3)
 264             .addOperand(RID)
 265             .addOperand(SID)
 266             .addImm(CTX)
 267             .addImm(CTY)
 268             .addImm(CTZ)
 269             .addImm(CTW);
 270     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 271             .addOperand(MI->getOperand(0))
 272             .addOperand(MI->getOperand(1))
 273             .addImm(SrcX)
 274             .addImm(SrcY)
 275             .addImm(SrcZ)
 276             .addImm(SrcW)
 277             .addImm(0)
 278             .addImm(0)
 279             .addImm(0)
 280             .addImm(0)
 281             .addImm(1)
 282             .addImm(2)
 283             .addImm(3)
 284             .addOperand(RID)
 285             .addOperand(SID)
 286             .addImm(CTX)
 287             .addImm(CTY)
 288             .addImm(CTZ)
 289             .addImm(CTW)
 290             .addReg(T0, RegState::Implicit)
 291             .addReg(T1, RegState::Implicit);
 292     break;
 293   }
 294
 295   case AMDGPU::TXD_SHADOW: {
 296     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 297     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 298     MachineOperand &RID = MI->getOperand(4);
 299     MachineOperand &SID = MI->getOperand(5);
 300     unsigned TextureId = MI->getOperand(6).getImm();
 301     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 302     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 303
 304     switch (TextureId) {
 305     case 5: // Rect
 306       CTX = CTY = 0;
 307       break;
 308     case 6: // Shadow1D
 309       SrcW = SrcZ;
 310       break;
 311     case 7: // Shadow2D
 312       SrcW = SrcZ;
 313       break;
 314     case 8: // ShadowRect
 315       CTX = CTY = 0;
 316       SrcW = SrcZ;
 317       break;
 318     case 9: // 1DArray
 319       SrcZ = SrcY;
 320       CTZ = 0;
 321       break;
 322     case 10: // 2DArray
 323       CTZ = 0;
 324       break;
 325     case 11: // Shadow1DArray
 326       SrcZ = SrcY;
 327       CTZ = 0;
 328       break;
 329     case 12: // Shadow2DArray
 330       CTZ = 0;
 331       break;
 332     }
 333
 334     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 335             .addOperand(MI->getOperand(3))
 336             .addImm(SrcX)
 337             .addImm(SrcY)
 338             .addImm(SrcZ)
 339             .addImm(SrcW)
 340             .addImm(0)
 341             .addImm(0)
 342             .addImm(0)
 343             .addImm(0)
 344             .addImm(1)
 345             .addImm(2)
 346             .addImm(3)
 347             .addOperand(RID)
 348             .addOperand(SID)
 349             .addImm(CTX)
 350             .addImm(CTY)
 351             .addImm(CTZ)
 352             .addImm(CTW);
 353     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 354             .addOperand(MI->getOperand(2))
 355             .addImm(SrcX)
 356             .addImm(SrcY)
 357             .addImm(SrcZ)
 358             .addImm(SrcW)
 359             .addImm(0)
 360             .addImm(0)
 361             .addImm(0)
 362             .addImm(0)
 363             .addImm(1)
 364             .addImm(2)
 365             .addImm(3)
 366             .addOperand(RID)
 367             .addOperand(SID)
 368             .addImm(CTX)
 369             .addImm(CTY)
 370             .addImm(CTZ)
 371             .addImm(CTW);
 372     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 373             .addOperand(MI->getOperand(0))
 374             .addOperand(MI->getOperand(1))
 375             .addImm(SrcX)
 376             .addImm(SrcY)
 377             .addImm(SrcZ)
 378             .addImm(SrcW)
 379             .addImm(0)
 380             .addImm(0)
 381             .addImm(0)
 382             .addImm(0)
 383             .addImm(1)
 384             .addImm(2)
 385             .addImm(3)
 386             .addOperand(RID)
 387             .addOperand(SID)
 388             .addImm(CTX)
 389             .addImm(CTY)
 390             .addImm(CTZ)
 391             .addImm(CTW)
 392             .addReg(T0, RegState::Implicit)
 393             .addReg(T1, RegState::Implicit);
 394     break;
 395   }
 396
 397   case AMDGPU::BRANCH:
 398       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 399               .addOperand(MI->getOperand(0));
 400       break;
 401
 402   case AMDGPU::BRANCH_COND_f32: {
 403     MachineInstr *NewMI =
 404       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 405               AMDGPU::PREDICATE_BIT)
 406               .addOperand(MI->getOperand(1))
 407               .addImm(OPCODE_IS_NOT_ZERO)
 408               .addImm(0); // Flags
 409     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 410     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 411             .addOperand(MI->getOperand(0))
 412             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 413     break;
 414   }
 415
 416   case AMDGPU::BRANCH_COND_i32: {
 417     MachineInstr *NewMI =
 418       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 419             AMDGPU::PREDICATE_BIT)
 420             .addOperand(MI->getOperand(1))
 421             .addImm(OPCODE_IS_NOT_ZERO_INT)
 422             .addImm(0); // Flags
 423     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 424     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 425            .addOperand(MI->getOperand(0))
 426             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 427     break;
 428   }
 429
 430   case AMDGPU::EG_ExportSwz:
 431   case AMDGPU::R600_ExportSwz: {
 432     // Instruction is left unmodified if its not the last one of its type
 433     bool isLastInstructionOfItsType = true;
 434     unsigned InstExportType = MI->getOperand(1).getImm();
 435     for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
 436          EndBlock = BB->end(); NextExportInst != EndBlock;
 437          NextExportInst = llvm::next(NextExportInst)) {
 438       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 439           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 440         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 441             .getImm();
 442         if (CurrentInstExportType == InstExportType) {
 443           isLastInstructionOfItsType = false;
 444           break;
 445         }
 446       }
 447     }
 448     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 449     if (!EOP && !isLastInstructionOfItsType)
 450       return BB;
 451     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 452     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 453             .addOperand(MI->getOperand(0))
 454             .addOperand(MI->getOperand(1))
 455             .addOperand(MI->getOperand(2))
 456             .addOperand(MI->getOperand(3))
 457             .addOperand(MI->getOperand(4))
 458             .addOperand(MI->getOperand(5))
 459             .addOperand(MI->getOperand(6))
 460             .addImm(CfInst)
 461             .addImm(EOP);
 462     break;
 463   }
 464   case AMDGPU::RETURN: {
 465     // RETURN instructions must have the live-out registers as implicit uses,
 466     // otherwise they appear dead.
 467     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 468     MachineInstrBuilder MIB(*MF, MI);
 469     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 470       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 471     return BB;
 472   }
 473   }
 474
 475   MI->eraseFromParent();
 476   return BB;
 477 }
 478
 479 //===----------------------------------------------------------------------===//
 480 // Custom DAG Lowering Operations
 481 //===----------------------------------------------------------------------===//
 482
 483 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 484   MachineFunction &MF = DAG.getMachineFunction();
 485   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 486   switch (Op.getOpcode()) {
 487   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 488   case ISD::FCOS:
 489   case ISD::FSIN: return LowerTrig(Op, DAG);
 490   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 491   case ISD::SELECT: return LowerSELECT(Op, DAG);
 492   case ISD::STORE: return LowerSTORE(Op, DAG);
 493   case ISD::LOAD: return LowerLOAD(Op, DAG);
 494   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
 495   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 496   case ISD::INTRINSIC_VOID: {
 497     SDValue Chain = Op.getOperand(0);
 498     unsigned IntrinsicID =
 499                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 500     switch (IntrinsicID) {
 501     case AMDGPUIntrinsic::AMDGPU_store_output: {
 502       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 503       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 504       MFI->LiveOuts.push_back(Reg);
 505       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 506     }
 507     case AMDGPUIntrinsic::R600_store_swizzle: {
 508       const SDValue Args[8] = {
 509         Chain,
 510         Op.getOperand(2), // Export Value
 511         Op.getOperand(3), // ArrayBase
 512         Op.getOperand(4), // Type
 513         DAG.getConstant(0, MVT::i32), // SWZ_X
 514         DAG.getConstant(1, MVT::i32), // SWZ_Y
 515         DAG.getConstant(2, MVT::i32), // SWZ_Z
 516         DAG.getConstant(3, MVT::i32) // SWZ_W
 517       };
 518       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
 519           Args, 8);
 520     }
 521
 522     // default for switch(IntrinsicID)
 523     default: break;
 524     }
 525     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 526     break;
 527   }
 528   case ISD::INTRINSIC_WO_CHAIN: {
 529     unsigned IntrinsicID =
 530                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 531     EVT VT = Op.getValueType();
 532     SDLoc DL(Op);
 533     switch(IntrinsicID) {
 534     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 535     case AMDGPUIntrinsic::R600_load_input: {
 536       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 537       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 538       MachineFunction &MF = DAG.getMachineFunction();
 539       MachineRegisterInfo &MRI = MF.getRegInfo();
 540       MRI.addLiveIn(Reg);
 541       return DAG.getCopyFromReg(DAG.getEntryNode(),
 542           SDLoc(DAG.getEntryNode()), Reg, VT);
 543     }
 544
 545     case AMDGPUIntrinsic::R600_interp_input: {
 546       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 547       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 548       MachineSDNode *interp;
 549       if (ijb < 0) {
 550         const MachineFunction &MF = DAG.getMachineFunction();
 551         const R600InstrInfo *TII =
 552           static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
 553         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 554             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 555         return DAG.getTargetExtractSubreg(
 556             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 557             DL, MVT::f32, SDValue(interp, 0));
 558       }
 559
 560       MachineFunction &MF = DAG.getMachineFunction();
 561       MachineRegisterInfo &MRI = MF.getRegInfo();
 562       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 563       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 564       MRI.addLiveIn(RegisterI);
 565       MRI.addLiveIn(RegisterJ);
 566       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 567           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 568       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 569           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 570
 571       if (slot % 4 < 2)
 572         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 573             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 574             RegisterJNode, RegisterINode);
 575       else
 576         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 577             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 578             RegisterJNode, RegisterINode);
 579       return SDValue(interp, slot % 2);
 580     }
 581     case AMDGPUIntrinsic::R600_tex:
 582     case AMDGPUIntrinsic::R600_texc:
 583     case AMDGPUIntrinsic::R600_txl:
 584     case AMDGPUIntrinsic::R600_txlc:
 585     case AMDGPUIntrinsic::R600_txb:
 586     case AMDGPUIntrinsic::R600_txbc:
 587     case AMDGPUIntrinsic::R600_txf:
 588     case AMDGPUIntrinsic::R600_txq:
 589     case AMDGPUIntrinsic::R600_ddx:
 590     case AMDGPUIntrinsic::R600_ddy: {
 591       unsigned TextureOp;
 592       switch (IntrinsicID) {
 593       case AMDGPUIntrinsic::R600_tex:
 594         TextureOp = 0;
 595         break;
 596       case AMDGPUIntrinsic::R600_texc:
 597         TextureOp = 1;
 598         break;
 599       case AMDGPUIntrinsic::R600_txl:
 600         TextureOp = 2;
 601         break;
 602       case AMDGPUIntrinsic::R600_txlc:
 603         TextureOp = 3;
 604         break;
 605       case AMDGPUIntrinsic::R600_txb:
 606         TextureOp = 4;
 607         break;
 608       case AMDGPUIntrinsic::R600_txbc:
 609         TextureOp = 5;
 610         break;
 611       case AMDGPUIntrinsic::R600_txf:
 612         TextureOp = 6;
 613         break;
 614       case AMDGPUIntrinsic::R600_txq:
 615         TextureOp = 7;
 616         break;
 617       case AMDGPUIntrinsic::R600_ddx:
 618         TextureOp = 8;
 619         break;
 620       case AMDGPUIntrinsic::R600_ddy:
 621         TextureOp = 9;
 622         break;
 623       default:
 624         llvm_unreachable("Unknow Texture Operation");
 625       }
 626
 627       SDValue TexArgs[19] = {
 628         DAG.getConstant(TextureOp, MVT::i32),
 629         Op.getOperand(1),
 630         DAG.getConstant(0, MVT::i32),
 631         DAG.getConstant(1, MVT::i32),
 632         DAG.getConstant(2, MVT::i32),
 633         DAG.getConstant(3, MVT::i32),
 634         Op.getOperand(2),
 635         Op.getOperand(3),
 636         Op.getOperand(4),
 637         DAG.getConstant(0, MVT::i32),
 638         DAG.getConstant(1, MVT::i32),
 639         DAG.getConstant(2, MVT::i32),
 640         DAG.getConstant(3, MVT::i32),
 641         Op.getOperand(5),
 642         Op.getOperand(6),
 643         Op.getOperand(7),
 644         Op.getOperand(8),
 645         Op.getOperand(9),
 646         Op.getOperand(10)
 647       };
 648       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
 649     }
 650     case AMDGPUIntrinsic::AMDGPU_dp4: {
 651       SDValue Args[8] = {
 652       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 653           DAG.getConstant(0, MVT::i32)),
 654       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 655           DAG.getConstant(0, MVT::i32)),
 656       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 657           DAG.getConstant(1, MVT::i32)),
 658       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 659           DAG.getConstant(1, MVT::i32)),
 660       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 661           DAG.getConstant(2, MVT::i32)),
 662       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 663           DAG.getConstant(2, MVT::i32)),
 664       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 665           DAG.getConstant(3, MVT::i32)),
 666       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 667           DAG.getConstant(3, MVT::i32))
 668       };
 669       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
 670     }
 671
 672     case Intrinsic::r600_read_ngroups_x:
 673       return LowerImplicitParameter(DAG, VT, DL, 0);
 674     case Intrinsic::r600_read_ngroups_y:
 675       return LowerImplicitParameter(DAG, VT, DL, 1);
 676     case Intrinsic::r600_read_ngroups_z:
 677       return LowerImplicitParameter(DAG, VT, DL, 2);
 678     case Intrinsic::r600_read_global_size_x:
 679       return LowerImplicitParameter(DAG, VT, DL, 3);
 680     case Intrinsic::r600_read_global_size_y:
 681       return LowerImplicitParameter(DAG, VT, DL, 4);
 682     case Intrinsic::r600_read_global_size_z:
 683       return LowerImplicitParameter(DAG, VT, DL, 5);
 684     case Intrinsic::r600_read_local_size_x:
 685       return LowerImplicitParameter(DAG, VT, DL, 6);
 686     case Intrinsic::r600_read_local_size_y:
 687       return LowerImplicitParameter(DAG, VT, DL, 7);
 688     case Intrinsic::r600_read_local_size_z:
 689       return LowerImplicitParameter(DAG, VT, DL, 8);
 690
 691     case Intrinsic::r600_read_tgid_x:
 692       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 693                                   AMDGPU::T1_X, VT);
 694     case Intrinsic::r600_read_tgid_y:
 695       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 696                                   AMDGPU::T1_Y, VT);
 697     case Intrinsic::r600_read_tgid_z:
 698       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 699                                   AMDGPU::T1_Z, VT);
 700     case Intrinsic::r600_read_tidig_x:
 701       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 702                                   AMDGPU::T0_X, VT);
 703     case Intrinsic::r600_read_tidig_y:
 704       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 705                                   AMDGPU::T0_Y, VT);
 706     case Intrinsic::r600_read_tidig_z:
 707       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 708                                   AMDGPU::T0_Z, VT);
 709     }
 710     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 711     break;
 712   }
 713   } // end switch(Op.getOpcode())
 714   return SDValue();
 715 }
 716
 717 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 718                                             SmallVectorImpl<SDValue> &Results,
 719                                             SelectionDAG &DAG) const {
 720   switch (N->getOpcode()) {
 721   default: return;
 722   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 723     return;
 724   case ISD::LOAD: {
 725     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 726     Results.push_back(SDValue(Node, 0));
 727     Results.push_back(SDValue(Node, 1));
 728     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 729     // function
 730     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 731     return;
 732   }
 733   case ISD::STORE:
 734     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 735     Results.push_back(SDValue(Node, 0));
 736     return;
 737   }
 738 }
 739
 740 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 741   // On hw >= R700, COS/SIN input must be between -1. and 1.
 742   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 743   EVT VT = Op.getValueType();
 744   SDValue Arg = Op.getOperand(0);
 745   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 746       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 747         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 748           DAG.getConstantFP(0.15915494309, MVT::f32)),
 749         DAG.getConstantFP(0.5, MVT::f32)));
 750   unsigned TrigNode;
 751   switch (Op.getOpcode()) {
 752   case ISD::FCOS:
 753     TrigNode = AMDGPUISD::COS_HW;
 754     break;
 755   case ISD::FSIN:
 756     TrigNode = AMDGPUISD::SIN_HW;
 757     break;
 758   default:
 759     llvm_unreachable("Wrong trig opcode");
 760   }
 761   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
 762       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
 763         DAG.getConstantFP(-0.5, MVT::f32)));
 764   if (Gen >= AMDGPUSubtarget::R700)
 765     return TrigVal;
 766   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 767   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
 768       DAG.getConstantFP(3.14159265359, MVT::f32));
 769 }
 770
 771 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 772   return DAG.getNode(
 773       ISD::SETCC,
 774       SDLoc(Op),
 775       MVT::i1,
 776       Op, DAG.getConstantFP(0.0f, MVT::f32),
 777       DAG.getCondCode(ISD::SETNE)
 778       );
 779 }
 780
 781 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 782                                                    SDLoc DL,
 783                                                    unsigned DwordOffset) const {
 784   unsigned ByteOffset = DwordOffset * 4;
 785   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 786                                       AMDGPUAS::CONSTANT_BUFFER_0);
 787
 788   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 789   assert(isInt<16>(ByteOffset));
 790
 791   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 792                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 793                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 794                      false, false, false, 0);
 795 }
 796
 797 SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
 798
 799   MachineFunction &MF = DAG.getMachineFunction();
 800   const AMDGPUFrameLowering *TFL =
 801    static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
 802
 803   FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
 804   assert(FIN);
 805
 806   unsigned FrameIndex = FIN->getIndex();
 807   unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
 808   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
 809 }
 810
 811 bool R600TargetLowering::isZero(SDValue Op) const {
 812   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 813     return Cst->isNullValue();
 814   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 815     return CstFP->isZero();
 816   } else {
 817     return false;
 818   }
 819 }
 820
 821 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 822   SDLoc DL(Op);
 823   EVT VT = Op.getValueType();
 824
 825   SDValue LHS = Op.getOperand(0);
 826   SDValue RHS = Op.getOperand(1);
 827   SDValue True = Op.getOperand(2);
 828   SDValue False = Op.getOperand(3);
 829   SDValue CC = Op.getOperand(4);
 830   SDValue Temp;
 831
 832   // LHS and RHS are guaranteed to be the same value type
 833   EVT CompareVT = LHS.getValueType();
 834
 835   // Check if we can lower this to a native operation.
 836
 837   // Try to lower to a SET* instruction:
 838   //
 839   // SET* can match the following patterns:
 840   //
 841   // select_cc f32, f32, -1,  0, cc_any
 842   // select_cc f32, f32, 1.0f, 0.0f, cc_any
 843   // select_cc i32, i32, -1,  0, cc_any
 844   //
 845
 846   // Move hardware True/False values to the correct operand.
 847   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 848     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 849     std::swap(False, True);
 850     CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
 851   }
 852
 853   if (isHWTrueValue(True) && isHWFalseValue(False) &&
 854       (CompareVT == VT || VT == MVT::i32)) {
 855     // This can be matched by a SET* instruction.
 856     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 857   }
 858
 859   // Try to lower to a CND* instruction:
 860   //
 861   // CND* can match the following patterns:
 862   //
 863   // select_cc f32, 0.0, f32, f32, cc_any
 864   // select_cc f32, 0.0, i32, i32, cc_any
 865   // select_cc i32, 0,   f32, f32, cc_any
 866   // select_cc i32, 0,   i32, i32, cc_any
 867   //
 868   if (isZero(LHS) || isZero(RHS)) {
 869     SDValue Cond = (isZero(LHS) ? RHS : LHS);
 870     SDValue Zero = (isZero(LHS) ? LHS : RHS);
 871     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 872     if (CompareVT != VT) {
 873       // Bitcast True / False to the correct types.  This will end up being
 874       // a nop, but it allows us to define only a single pattern in the
 875       // .TD files for each CND* instruction rather than having to have
 876       // one pattern for integer True/False and one for fp True/False
 877       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 878       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 879     }
 880     if (isZero(LHS)) {
 881       CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
 882     }
 883
 884     switch (CCOpcode) {
 885     case ISD::SETONE:
 886     case ISD::SETUNE:
 887     case ISD::SETNE:
 888     case ISD::SETULE:
 889     case ISD::SETULT:
 890     case ISD::SETOLE:
 891     case ISD::SETOLT:
 892     case ISD::SETLE:
 893     case ISD::SETLT:
 894       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 895       Temp = True;
 896       True = False;
 897       False = Temp;
 898       break;
 899     default:
 900       break;
 901     }
 902     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 903         Cond, Zero,
 904         True, False,
 905         DAG.getCondCode(CCOpcode));
 906     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 907   }
 908
 909
 910   // Possible Min/Max pattern
 911   SDValue MinMax = LowerMinMax(Op, DAG);
 912   if (MinMax.getNode()) {
 913     return MinMax;
 914   }
 915
 916   // If we make it this for it means we have no native instructions to handle
 917   // this SELECT_CC, so we must lower it.
 918   SDValue HWTrue, HWFalse;
 919
 920   if (CompareVT == MVT::f32) {
 921     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 922     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 923   } else if (CompareVT == MVT::i32) {
 924     HWTrue = DAG.getConstant(-1, CompareVT);
 925     HWFalse = DAG.getConstant(0, CompareVT);
 926   }
 927   else {
 928     assert(!"Unhandled value type in LowerSELECT_CC");
 929   }
 930
 931   // Lower this unsupported SELECT_CC into a combination of two supported
 932   // SELECT_CC operations.
 933   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 934
 935   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 936       Cond, HWFalse,
 937       True, False,
 938       DAG.getCondCode(ISD::SETNE));
 939 }
 940
 941 SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 942   return DAG.getNode(ISD::SELECT_CC,
 943       SDLoc(Op),
 944       Op.getValueType(),
 945       Op.getOperand(0),
 946       DAG.getConstant(0, MVT::i32),
 947       Op.getOperand(1),
 948       Op.getOperand(2),
 949       DAG.getCondCode(ISD::SETNE));
 950 }
 951
 952 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 953 /// convert these pointers to a register index.  Each register holds
 954 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
 955 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
 956 /// for indirect addressing.
 957 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
 958                                                unsigned StackWidth,
 959                                                SelectionDAG &DAG) const {
 960   unsigned SRLPad;
 961   switch(StackWidth) {
 962   case 1:
 963     SRLPad = 2;
 964     break;
 965   case 2:
 966     SRLPad = 3;
 967     break;
 968   case 4:
 969     SRLPad = 4;
 970     break;
 971   default: llvm_unreachable("Invalid stack width");
 972   }
 973
 974   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
 975                      DAG.getConstant(SRLPad, MVT::i32));
 976 }
 977
 978 void R600TargetLowering::getStackAddress(unsigned StackWidth,
 979                                          unsigned ElemIdx,
 980                                          unsigned &Channel,
 981                                          unsigned &PtrIncr) const {
 982   switch (StackWidth) {
 983   default:
 984   case 1:
 985     Channel = 0;
 986     if (ElemIdx > 0) {
 987       PtrIncr = 1;
 988     } else {
 989       PtrIncr = 0;
 990     }
 991     break;
 992   case 2:
 993     Channel = ElemIdx % 2;
 994     if (ElemIdx == 2) {
 995       PtrIncr = 1;
 996     } else {
 997       PtrIncr = 0;
 998     }
 999     break;
1000   case 4:
1001     Channel = ElemIdx;
1002     PtrIncr = 0;
1003     break;
1004   }
1005 }
1006
1007 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1008   SDLoc DL(Op);
1009   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1010   SDValue Chain = Op.getOperand(0);
1011   SDValue Value = Op.getOperand(1);
1012   SDValue Ptr = Op.getOperand(2);
1013
1014   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1015     if (StoreNode->isTruncatingStore()) {
1016       EVT VT = Value.getValueType();
1017       assert(VT == MVT::i32);
1018       EVT MemVT = StoreNode->getMemoryVT();
1019       SDValue MaskConstant;
1020       if (MemVT == MVT::i8) {
1021         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1022       } else {
1023         assert(MemVT == MVT::i16);
1024         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1025       }
1026       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1027                                       DAG.getConstant(2, MVT::i32));
1028       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1029                                       DAG.getConstant(0x00000003, VT));
1030       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1031       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1032                                    DAG.getConstant(3, VT));
1033       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1034       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1035       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1036       // vector instead.
1037       SDValue Src[4] = {
1038         ShiftedValue,
1039         DAG.getConstant(0, MVT::i32),
1040         DAG.getConstant(0, MVT::i32),
1041         Mask
1042       };
1043       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
1044       SDValue Args[3] = { Chain, Input, DWordAddr };
1045       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1046                                      Op->getVTList(), Args, 3, MemVT,
1047                                      StoreNode->getMemOperand());
1048     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1049                Value.getValueType().bitsGE(MVT::i32)) {
1050       // Convert pointer from byte address to dword address.
1051       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1052                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1053                                     Ptr, DAG.getConstant(2, MVT::i32)));
1054
1055       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1056         assert(!"Truncated and indexed stores not supported yet");
1057       } else {
1058         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1059       }
1060       return Chain;
1061     }
1062   }
1063
1064   EVT ValueVT = Value.getValueType();
1065
1066   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1067     return SDValue();
1068   }
1069
1070   // Lowering for indirect addressing
1071
1072   const MachineFunction &MF = DAG.getMachineFunction();
1073   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1074                                          getTargetMachine().getFrameLowering());
1075   unsigned StackWidth = TFL->getStackWidth(MF);
1076
1077   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1078
1079   if (ValueVT.isVector()) {
1080     unsigned NumElemVT = ValueVT.getVectorNumElements();
1081     EVT ElemVT = ValueVT.getVectorElementType();
1082     SDValue Stores[4];
1083
1084     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1085                                       "vector width in load");
1086
1087     for (unsigned i = 0; i < NumElemVT; ++i) {
1088       unsigned Channel, PtrIncr;
1089       getStackAddress(StackWidth, i, Channel, PtrIncr);
1090       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1091                         DAG.getConstant(PtrIncr, MVT::i32));
1092       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1093                                  Value, DAG.getConstant(i, MVT::i32));
1094
1095       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1096                               Chain, Elem, Ptr,
1097                               DAG.getTargetConstant(Channel, MVT::i32));
1098     }
1099      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1100    } else {
1101     if (ValueVT == MVT::i8) {
1102       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1103     }
1104     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1105     DAG.getTargetConstant(0, MVT::i32)); // Channel
1106   }
1107
1108   return Chain;
1109 }
1110
1111 // return (512 + (kc_bank << 12)
1112 static int
1113 ConstantAddressBlock(unsigned AddressSpace) {
1114   switch (AddressSpace) {
1115   case AMDGPUAS::CONSTANT_BUFFER_0:
1116     return 512;
1117   case AMDGPUAS::CONSTANT_BUFFER_1:
1118     return 512 + 4096;
1119   case AMDGPUAS::CONSTANT_BUFFER_2:
1120     return 512 + 4096 * 2;
1121   case AMDGPUAS::CONSTANT_BUFFER_3:
1122     return 512 + 4096 * 3;
1123   case AMDGPUAS::CONSTANT_BUFFER_4:
1124     return 512 + 4096 * 4;
1125   case AMDGPUAS::CONSTANT_BUFFER_5:
1126     return 512 + 4096 * 5;
1127   case AMDGPUAS::CONSTANT_BUFFER_6:
1128     return 512 + 4096 * 6;
1129   case AMDGPUAS::CONSTANT_BUFFER_7:
1130     return 512 + 4096 * 7;
1131   case AMDGPUAS::CONSTANT_BUFFER_8:
1132     return 512 + 4096 * 8;
1133   case AMDGPUAS::CONSTANT_BUFFER_9:
1134     return 512 + 4096 * 9;
1135   case AMDGPUAS::CONSTANT_BUFFER_10:
1136     return 512 + 4096 * 10;
1137   case AMDGPUAS::CONSTANT_BUFFER_11:
1138     return 512 + 4096 * 11;
1139   case AMDGPUAS::CONSTANT_BUFFER_12:
1140     return 512 + 4096 * 12;
1141   case AMDGPUAS::CONSTANT_BUFFER_13:
1142     return 512 + 4096 * 13;
1143   case AMDGPUAS::CONSTANT_BUFFER_14:
1144     return 512 + 4096 * 14;
1145   case AMDGPUAS::CONSTANT_BUFFER_15:
1146     return 512 + 4096 * 15;
1147   default:
1148     return -1;
1149   }
1150 }
1151
1152 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1153 {
1154   EVT VT = Op.getValueType();
1155   SDLoc DL(Op);
1156   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1157   SDValue Chain = Op.getOperand(0);
1158   SDValue Ptr = Op.getOperand(1);
1159   SDValue LoweredLoad;
1160
1161   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1162   if (ConstantBlock > -1) {
1163     SDValue Result;
1164     if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
1165         dyn_cast<Constant>(LoadNode->getSrcValue()) ||
1166         dyn_cast<ConstantSDNode>(Ptr)) {
1167       SDValue Slots[4];
1168       for (unsigned i = 0; i < 4; i++) {
1169         // We want Const position encoded with the following formula :
1170         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1171         // const_index is Ptr computed by llvm using an alignment of 16.
1172         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1173         // then div by 4 at the ISel step
1174         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1175             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1176         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1177       }
1178       EVT NewVT = MVT::v4i32;
1179       unsigned NumElements = 4;
1180       if (VT.isVector()) {
1181         NewVT = VT;
1182         NumElements = VT.getVectorNumElements();
1183       }
1184       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
1185     } else {
1186       // non constant ptr cant be folded, keeps it as a v4f32 load
1187       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1188           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1189           DAG.getConstant(LoadNode->getAddressSpace() -
1190                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1191           );
1192     }
1193
1194     if (!VT.isVector()) {
1195       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1196           DAG.getConstant(0, MVT::i32));
1197     }
1198
1199     SDValue MergedValues[2] = {
1200         Result,
1201         Chain
1202     };
1203     return DAG.getMergeValues(MergedValues, 2, DL);
1204   }
1205
1206   // For most operations returning SDValue() will result int he node being
1207   // expanded by the DAG Legalizer.  This is not the case for ISD::LOAD, so
1208   // we need to manually expand loads that may be legal in some address spaces
1209   // and illegal in others.  SEXT loads from CONSTANT_BUFFER_0 are supported
1210   // for compute shaders, since the data is sign extended when it is uploaded
1211   // to the buffer.  Howerver SEXT loads from other addresspaces are not
1212   // supported, so we need to expand them here.
1213   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1214     EVT MemVT = LoadNode->getMemoryVT();
1215     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1216     SDValue ShiftAmount =
1217           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1218     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1219                                   LoadNode->getPointerInfo(), MemVT,
1220                                   LoadNode->isVolatile(),
1221                                   LoadNode->isNonTemporal(),
1222                                   LoadNode->getAlignment());
1223     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1224     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1225
1226     SDValue MergedValues[2] = { Sra, Chain };
1227     return DAG.getMergeValues(MergedValues, 2, DL);
1228   }
1229
1230   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1231     return SDValue();
1232   }
1233
1234   // Lowering for indirect addressing
1235   const MachineFunction &MF = DAG.getMachineFunction();
1236   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1237                                          getTargetMachine().getFrameLowering());
1238   unsigned StackWidth = TFL->getStackWidth(MF);
1239
1240   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1241
1242   if (VT.isVector()) {
1243     unsigned NumElemVT = VT.getVectorNumElements();
1244     EVT ElemVT = VT.getVectorElementType();
1245     SDValue Loads[4];
1246
1247     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1248                                       "vector width in load");
1249
1250     for (unsigned i = 0; i < NumElemVT; ++i) {
1251       unsigned Channel, PtrIncr;
1252       getStackAddress(StackWidth, i, Channel, PtrIncr);
1253       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1254                         DAG.getConstant(PtrIncr, MVT::i32));
1255       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1256                              Chain, Ptr,
1257                              DAG.getTargetConstant(Channel, MVT::i32),
1258                              Op.getOperand(2));
1259     }
1260     for (unsigned i = NumElemVT; i < 4; ++i) {
1261       Loads[i] = DAG.getUNDEF(ElemVT);
1262     }
1263     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1264     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1265   } else {
1266     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1267                               Chain, Ptr,
1268                               DAG.getTargetConstant(0, MVT::i32), // Channel
1269                               Op.getOperand(2));
1270   }
1271
1272   SDValue Ops[2];
1273   Ops[0] = LoweredLoad;
1274   Ops[1] = Chain;
1275
1276   return DAG.getMergeValues(Ops, 2, DL);
1277 }
1278
1279 /// XXX Only kernel functions are supported, so we can assume for now that
1280 /// every function is a kernel function, but in the future we should use
1281 /// separate calling conventions for kernel and non-kernel functions.
1282 SDValue R600TargetLowering::LowerFormalArguments(
1283                                       SDValue Chain,
1284                                       CallingConv::ID CallConv,
1285                                       bool isVarArg,
1286                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1287                                       SDLoc DL, SelectionDAG &DAG,
1288                                       SmallVectorImpl<SDValue> &InVals) const {
1289   SmallVector<CCValAssign, 16> ArgLocs;
1290   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1291                  getTargetMachine(), ArgLocs, *DAG.getContext());
1292
1293   AnalyzeFormalArguments(CCInfo, Ins);
1294
1295   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1296     CCValAssign &VA = ArgLocs[i];
1297     EVT VT = VA.getLocVT();
1298
1299     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1300                                                    AMDGPUAS::CONSTANT_BUFFER_0);
1301
1302     // The first 36 bytes of the input buffer contains information about
1303     // thread group and global sizes.
1304     SDValue Arg = DAG.getLoad(VT, DL, Chain,
1305                            DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1306                            MachinePointerInfo(UndefValue::get(PtrTy)), false,
1307                            false, false, 4); // 4 is the prefered alignment for
1308                                              // the CONSTANT memory space.
1309     InVals.push_back(Arg);
1310   }
1311   return Chain;
1312 }
1313
1314 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1315    if (!VT.isVector()) return MVT::i32;
1316    return VT.changeVectorElementTypeToInteger();
1317 }
1318
1319 static SDValue
1320 CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
1321                         DenseMap<unsigned, unsigned> &RemapSwizzle) {
1322   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1323   assert(RemapSwizzle.empty());
1324   SDValue NewBldVec[4] = {
1325       VectorEntry.getOperand(0),
1326       VectorEntry.getOperand(1),
1327       VectorEntry.getOperand(2),
1328       VectorEntry.getOperand(3)
1329   };
1330
1331   for (unsigned i = 0; i < 4; i++) {
1332     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1333       if (C->isZero()) {
1334         RemapSwizzle[i] = 4; // SEL_0
1335         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1336       } else if (C->isExactlyValue(1.0)) {
1337         RemapSwizzle[i] = 5; // SEL_1
1338         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1339       }
1340     }
1341
1342     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1343       continue;
1344     for (unsigned j = 0; j < i; j++) {
1345       if (NewBldVec[i] == NewBldVec[j]) {
1346         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1347         RemapSwizzle[i] = j;
1348         break;
1349       }
1350     }
1351   }
1352
1353   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1354       VectorEntry.getValueType(), NewBldVec, 4);
1355 }
1356
1357 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1358                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1359   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1360   assert(RemapSwizzle.empty());
1361   SDValue NewBldVec[4] = {
1362       VectorEntry.getOperand(0),
1363       VectorEntry.getOperand(1),
1364       VectorEntry.getOperand(2),
1365       VectorEntry.getOperand(3)
1366   };
1367   bool isUnmovable[4] = { false, false, false, false };
1368   for (unsigned i = 0; i < 4; i++)
1369     RemapSwizzle[i] = i;
1370
1371   for (unsigned i = 0; i < 4; i++) {
1372     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1373       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1374           ->getZExtValue();
1375       if (!isUnmovable[Idx]) {
1376         // Swap i and Idx
1377         std::swap(NewBldVec[Idx], NewBldVec[i]);
1378         std::swap(RemapSwizzle[RemapSwizzle[Idx]], RemapSwizzle[RemapSwizzle[i]]);
1379       }
1380       isUnmovable[Idx] = true;
1381     }
1382   }
1383
1384   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1385       VectorEntry.getValueType(), NewBldVec, 4);
1386 }
1387
1388
1389 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1390 SDValue Swz[4], SelectionDAG &DAG) const {
1391   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1392   // Old -> New swizzle values
1393   DenseMap<unsigned, unsigned> SwizzleRemap;
1394
1395   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1396   for (unsigned i = 0; i < 4; i++) {
1397     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1398     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1399       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1400   }
1401
1402   SwizzleRemap.clear();
1403   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1404   for (unsigned i = 0; i < 4; i++) {
1405     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1406     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1407       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1408   }
1409
1410   return BuildVector;
1411 }
1412
1413
1414 //===----------------------------------------------------------------------===//
1415 // Custom DAG Optimizations
1416 //===----------------------------------------------------------------------===//
1417
1418 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1419                                               DAGCombinerInfo &DCI) const {
1420   SelectionDAG &DAG = DCI.DAG;
1421
1422   switch (N->getOpcode()) {
1423   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1424   case ISD::FP_ROUND: {
1425       SDValue Arg = N->getOperand(0);
1426       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1427         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1428                            Arg.getOperand(0));
1429       }
1430       break;
1431     }
1432
1433   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1434   // (i32 select_cc f32, f32, -1, 0 cc)
1435   //
1436   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1437   // this to one of the SET*_DX10 instructions.
1438   case ISD::FP_TO_SINT: {
1439     SDValue FNeg = N->getOperand(0);
1440     if (FNeg.getOpcode() != ISD::FNEG) {
1441       return SDValue();
1442     }
1443     SDValue SelectCC = FNeg.getOperand(0);
1444     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1445         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1446         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1447         !isHWTrueValue(SelectCC.getOperand(2)) ||
1448         !isHWFalseValue(SelectCC.getOperand(3))) {
1449       return SDValue();
1450     }
1451
1452     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1453                            SelectCC.getOperand(0), // LHS
1454                            SelectCC.getOperand(1), // RHS
1455                            DAG.getConstant(-1, MVT::i32), // True
1456                            DAG.getConstant(0, MVT::i32),  // Flase
1457                            SelectCC.getOperand(4)); // CC
1458
1459     break;
1460   }
1461
1462   // insert_vector_elt (build_vector elt0, …, eltN), NewEltIdx, idx
1463   // => build_vector elt0, …, NewEltIdx, …, eltN
1464   case ISD::INSERT_VECTOR_ELT: {
1465     SDValue InVec = N->getOperand(0);
1466     SDValue InVal = N->getOperand(1);
1467     SDValue EltNo = N->getOperand(2);
1468     SDLoc dl(N);
1469
1470     // If the inserted element is an UNDEF, just use the input vector.
1471     if (InVal.getOpcode() == ISD::UNDEF)
1472       return InVec;
1473
1474     EVT VT = InVec.getValueType();
1475
1476     // If we can't generate a legal BUILD_VECTOR, exit
1477     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1478       return SDValue();
1479
1480     // Check that we know which element is being inserted
1481     if (!isa<ConstantSDNode>(EltNo))
1482       return SDValue();
1483     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1484
1485     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1486     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1487     // vector elements.
1488     SmallVector<SDValue, 8> Ops;
1489     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1490       Ops.append(InVec.getNode()->op_begin(),
1491                  InVec.getNode()->op_end());
1492     } else if (InVec.getOpcode() == ISD::UNDEF) {
1493       unsigned NElts = VT.getVectorNumElements();
1494       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1495     } else {
1496       return SDValue();
1497     }
1498
1499     // Insert the element
1500     if (Elt < Ops.size()) {
1501       // All the operands of BUILD_VECTOR must have the same type;
1502       // we enforce that here.
1503       EVT OpVT = Ops[0].getValueType();
1504       if (InVal.getValueType() != OpVT)
1505         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1506           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1507           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1508       Ops[Elt] = InVal;
1509     }
1510
1511     // Return the new vector
1512     return DAG.getNode(ISD::BUILD_VECTOR, dl,
1513                        VT, &Ops[0], Ops.size());
1514   }
1515
1516   // Extract_vec (Build_vector) generated by custom lowering
1517   // also needs to be customly combined
1518   case ISD::EXTRACT_VECTOR_ELT: {
1519     SDValue Arg = N->getOperand(0);
1520     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1521       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1522         unsigned Element = Const->getZExtValue();
1523         return Arg->getOperand(Element);
1524       }
1525     }
1526     if (Arg.getOpcode() == ISD::BITCAST &&
1527         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1528       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1529         unsigned Element = Const->getZExtValue();
1530         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1531             Arg->getOperand(0).getOperand(Element));
1532       }
1533     }
1534   }
1535
1536   case ISD::SELECT_CC: {
1537     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1538     //      selectcc x, y, a, b, inv(cc)
1539     //
1540     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1541     //      selectcc x, y, a, b, cc
1542     SDValue LHS = N->getOperand(0);
1543     if (LHS.getOpcode() != ISD::SELECT_CC) {
1544       return SDValue();
1545     }
1546
1547     SDValue RHS = N->getOperand(1);
1548     SDValue True = N->getOperand(2);
1549     SDValue False = N->getOperand(3);
1550     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1551
1552     if (LHS.getOperand(2).getNode() != True.getNode() ||
1553         LHS.getOperand(3).getNode() != False.getNode() ||
1554         RHS.getNode() != False.getNode()) {
1555       return SDValue();
1556     }
1557
1558     switch (NCC) {
1559     default: return SDValue();
1560     case ISD::SETNE: return LHS;
1561     case ISD::SETEQ: {
1562       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1563       LHSCC = ISD::getSetCCInverse(LHSCC,
1564                                   LHS.getOperand(0).getValueType().isInteger());
1565       return DAG.getSelectCC(SDLoc(N),
1566                              LHS.getOperand(0),
1567                              LHS.getOperand(1),
1568                              LHS.getOperand(2),
1569                              LHS.getOperand(3),
1570                              LHSCC);
1571     }
1572     }
1573   }
1574   case AMDGPUISD::EXPORT: {
1575     SDValue Arg = N->getOperand(1);
1576     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1577       break;
1578
1579     SDValue NewArgs[8] = {
1580       N->getOperand(0), // Chain
1581       SDValue(),
1582       N->getOperand(2), // ArrayBase
1583       N->getOperand(3), // Type
1584       N->getOperand(4), // SWZ_X
1585       N->getOperand(5), // SWZ_Y
1586       N->getOperand(6), // SWZ_Z
1587       N->getOperand(7) // SWZ_W
1588     };
1589     SDLoc DL(N);
1590     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1591     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1592   }
1593   case AMDGPUISD::TEXTURE_FETCH: {
1594     SDValue Arg = N->getOperand(1);
1595     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1596       break;
1597
1598     SDValue NewArgs[19] = {
1599       N->getOperand(0),
1600       N->getOperand(1),
1601       N->getOperand(2),
1602       N->getOperand(3),
1603       N->getOperand(4),
1604       N->getOperand(5),
1605       N->getOperand(6),
1606       N->getOperand(7),
1607       N->getOperand(8),
1608       N->getOperand(9),
1609       N->getOperand(10),
1610       N->getOperand(11),
1611       N->getOperand(12),
1612       N->getOperand(13),
1613       N->getOperand(14),
1614       N->getOperand(15),
1615       N->getOperand(16),
1616       N->getOperand(17),
1617       N->getOperand(18),
1618     };
1619     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1620     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1621         NewArgs, 19);
1622   }
1623   }
1624   return SDValue();
1625 }