//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;
R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::Source);
}
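
// Expand pseudo instructions that need basic-block context into real R600
// instructions: clamp/abs/neg source modifiers, immediate moves, RAT writes,
// texture sampling with user-provided gradients, conditional branches,
// exports and returns.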
MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::LDS_1A) {
      // LDS_1A instructions implicitly define OQAP; copy the result out of
      // OQAP into the destination register with a MOV.
      MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                                          TII->get(MI->getOpcode()),
                                          AMDGPU::OQAP);
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
      TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
                                   MI->getOperand(0).getReg(),
                                   AMDGPU::OQAP);
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }
  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }
  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }
  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }
  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
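
    // The TextureId encodes the sampler type: shadow and array variants need
    // the compare value / layer index moved into the right coordinate slot,
    // and rectangle textures use unnormalized X/Y coordinates.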
    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9:  // 1DArray
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9:  // 1DArray
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
              .addOperand(MI->getOperand(0));
      break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32) // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                    cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
            static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      default:
        llvm_unreachable("Unknown texture operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(3, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}
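
// Replace the results of nodes the type legalizer cannot handle directly with
// custom-lowered equivalents (FP_TO_UINT on i1, loads and stores).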
void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
  }
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
          DAG.getConstantFP(0.15915494309, MVT::f32)),
        DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
        DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}
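
// An fp_to_uint producing an i1 only has to test the input against zero, so
// it is lowered to a single SETCC with the SETNE condition.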
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}
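
// Frame indices are lowered to constant byte offsets: the frame lowering
// assigns each object an index, which is scaled here by the 4-byte element
// size and the stack width chosen for private memory.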
SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}
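
// Returns true if Op is an integer or floating-point constant zero.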
bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_any
  // select_cc f32, f32, 1.0f, 0.0f, cc_any
  // select_cc i32, i32, -1, 0, cc_any

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_any
  // select_cc f32, 0.0, i32, i32, cc_any
  // select_cc i32, 0, f32, f32, cc_any
  // select_cc i32, 0, i32, i32, cc_any
  //
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      // The operands were swapped above, so swap the condition code too.
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}
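
// Map a flat element index onto the register channel and pointer (register
// index) increment used for private-memory accesses at the given stack width.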
void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, 3, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));
    }
    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
      assert(!"Truncated and indexed stores not supported yet");
    } else {
      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    }
    return Chain;
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// return (512 + (kc_bank << 12))
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}
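
// Lower loads: constant-buffer loads become CONST_ADDRESS nodes (folded to
// immediate kcache positions when the pointer is constant), unsupported SEXT
// loads are expanded with a SHL/SRA pair, and private loads are scalarized
// into REGISTER_LOAD nodes using the stack addressing computed above.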
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      SplitVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
        dyn_cast<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want Const position encoded with the following formula :
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
        // then div by 4 at the ISel step
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
    } else {
      // A non-constant ptr can't be folded; keep it as a v4f32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so
  // we need to manually expand loads that may be legal in some address spaces
  // and illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported
  // for compute shaders, since the data is sign extended when it is uploaded
  // to the buffer. However, SEXT loads from other address spaces are not
  // supported, so we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  AnalyzeFormalArguments(CCInfo, Ins);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT VT = VA.getLocVT();

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    SDValue Arg = DAG.getLoad(VT, DL, Chain,
                              DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
                              MachinePointerInfo(UndefValue::get(PtrTy)), false,
                              false, false, 4); // 4 is the preferred alignment for
                                                // the CONSTANT memory space.
    InVals.push_back(Arg);
  }
  return Chain;
}
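
// SETCC results are i32 for scalars; vector compares keep the operand shape
// with integer elements of the same width.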
EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}
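
// Fold BUILD_VECTOR elements that are the constants 0.0 or 1.0 into the
// SEL_0 / SEL_1 hardware swizzle selects, and reuse duplicated elements,
// recording every change in RemapSwizzle.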
static SDValue
CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
      VectorEntry.getValueType(), NewBldVec, 4);
}
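
// Reorder BUILD_VECTOR operands so that an element extracted from another
// vector ends up in the channel it was extracted from, recording the
// resulting permutation in RemapSwizzle.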
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++)
    RemapSwizzle[i] = i;

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (!isUnmovable[Idx]) {
        // Swap i and Idx
        std::swap(NewBldVec[Idx], NewBldVec[i]);
        std::swap(RemapSwizzle[RemapSwizzle[Idx]], RemapSwizzle[RemapSwizzle[i]]);
      }
      isUnmovable[Idx] = true;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
      VectorEntry.getValueType(), NewBldVec, 4);
}
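
// Rewrite the four swizzle operands in Swz to match the vector returned by
// CompactSwizzlableVector and ReorganizeVector, so exports and texture
// fetches read the intended channels.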
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4], SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                           SelectCC.getOperand(0), // LHS
                           SelectCC.getOperand(1), // RHS
                           DAG.getConstant(-1, MVT::i32), // True
                           DAG.getConstant(0, MVT::i32), // False
                           SelectCC.getOperand(4)); // CC
  }

  // insert_vector_elt (build_vector elt0, …, eltN), NewEltIdx, idx
  // => build_vector elt0, …, NewEltIdx, …, eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getNode(ISD::BUILD_VECTOR, dl,
                       VT, &Ops[0], Ops.size());
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
            Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }
  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      return DAG.getSelectCC(SDLoc(N),
                             LHS.getOperand(0),
                             LHS.getOperand(1),
                             LHS.getOperand(2),
                             LHS.getOperand(3),
                             LHSCC);
    }
    }
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7) // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18)
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
        NewArgs, 19);
  }
  }

  return SDValue();
}