//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;
R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FADD, MVT::v2f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v2f32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::Source);
}
MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }
  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::LDS_READ_RET: {
    MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                                        TII->get(MI->getOpcode()),
                                        AMDGPU::OQAP);
    for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
      NewMI.addOperand(MI->getOperand(i));
    }
    TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
                                 MI->getOperand(0).getReg(),
                                 AMDGPU::OQAP);
    break;
  }
  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
        MI->getOperand(1).getImm());
    break;
  }
  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }
  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }
  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
                .addOperand(MI->getOperand(1))
                .addImm(OPCODE_IS_NOT_ZERO)
                .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
                .addOperand(MI->getOperand(1))
                .addImm(OPCODE_IS_NOT_ZERO_INT)
                .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }
  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}
//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
                         Args, 8);
    }
    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
                                SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
            static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      default:
        llvm_unreachable("Unknown texture operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }
    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}
void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace the Chain value inside
    // CustomWidenLowerNode, so do it explicitly here.
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
  }
}
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
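  // For example, 0.15915494309 below is 1/(2*Pi), so for Arg = Pi the FRACT
  // operand is Pi * (1/(2*Pi)) + 0.5 = 1.0, FRACT(1.0) = 0.0, and the final
  // TRIG input is 0.0 - 0.5 = -0.5: every input lands in [-0.5, 0.5].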
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
          DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
              DAG.getConstantFP(0.15915494309, MVT::f32)),
          DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
          DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}
SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}
bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type.
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_any
  // select_cc f32, f32, 1.0f, 0.0f, cc_any
  // select_cc i32, i32, -1, 0, cc_any
  //

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }
  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_any
  // select_cc f32, 0.0, i32, i32, cc_any
  // select_cc i32, 0, f32, f32, cc_any
  // select_cc i32, 0, i32, i32, cc_any
  //
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }
  // Possible Min/Max pattern.
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue,
                             HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}
SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
                     SDLoc(Op),
                     Op.getValueType(),
                     Op.getOperand(0),
                     DAG.getConstant(0, MVT::i32),
                     Op.getOperand(1),
                     Op.getOperand(2),
                     DAG.getCondCode(ISD::SETNE));
}
/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
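/// For example, with a StackWidth of 1 only one channel per register is used,
/// so indices are 4 bytes apart and the byte address is shifted right by 2;
/// with a StackWidth of 4 all four channels are used and the address is
/// shifted right by 4 (16 bytes per register index).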
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}
void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerVectorStore(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, 3, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        assert(!"Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }
  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing.
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));
      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value,
                        Ptr, DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}
// Returns the base constant-register address for a constant-buffer address
// space: 512 + (kc_bank << 12), or -1 if the address space is not a constant
// buffer.
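// For example, CONSTANT_BUFFER_2 yields 512 + (2 << 12) = 512 + 4096 * 2 =
// 8704, matching the case below.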
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
        dyn_cast<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is derived from Ptr, which llvm computes with an
        // alignment of 16. Thus we add ConstantBlock * 16 + chan * 4 to the
        // byte pointer here and then divide by 4 at the ISel step.
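        // E.g. for kc_bank 0 (ConstantBlock = 512) and chan i: NewPtr =
        // Ptr + 4 * i + 8192, and after the divide by 4 this becomes
        // Ptr / 4 + i + 2048 = ((512 + Ptr / 16) << 2) + i.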
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
    } else {
      // A non-constant ptr can't be folded; keep it as a v4f32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }
  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign-extended when it is uploaded to
  // the buffer. However, SEXT loads from other address spaces are not
  // supported, so we need to expand them here.
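  // For example, an i8 SEXTLOAD into i32 is expanded below to an i8 EXTLOAD
  // followed by SHL and SRA with ShiftAmount = 32 - 8 = 24, which replicates
  // the sign bit into the upper 24 bits.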
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }
  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing.
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain,
    CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins,
    SDLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  AnalyzeFormalArguments(CCInfo, Ins);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT VT = VA.getLocVT();

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);
    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
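    // (36 bytes = 9 dwords: the ngroups, global-size and local-size triples
    // read at dword offsets 0 through 8 by LowerImplicitParameter above.)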
    SDValue Arg = DAG.getLoad(VT, DL, Chain,
                              DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
                              MachinePointerInfo(UndefValue::get(PtrTy)), false,
                              false, false, 4); // 4 is the preferred alignment
                                                // for the CONSTANT memory space.
    InVals.push_back(Arg);
  }
  return Chain;
}
EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}
static SDValue
CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec, 4);
}
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++)
    RemapSwizzle[i] = i;

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (!isUnmovable[Idx]) {
        // Swap i and Idx.
        std::swap(NewBldVec[Idx], NewBldVec[i]);
        std::swap(RemapSwizzle[RemapSwizzle[Idx]], RemapSwizzle[RemapSwizzle[i]]);
      }
      isUnmovable[Idx] = true;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec, 4);
}
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values.
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}
//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//
SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }
  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC
  }
  // insert_vector_elt (build_vector elt0, ..., eltN), NewEltIdx, idx
  // => build_vector elt0, ..., NewEltIdx, ..., eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit.
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted.
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element.
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector.
    return DAG.getNode(ISD::BUILD_VECTOR, dl,
                       VT, &Ops[0], Ops.size());
  }
  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }
  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    // selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    // selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      return DAG.getSelectCC(SDLoc(N),
                             LHS.getOperand(0),
                             LHS.getOperand(1),
                             LHS.getOperand(2),
                             LHS.getOperand(3),
                             LHSCC);
    }
    }
  }
  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18)
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
                       NewArgs, 19);
  }
  }
  return SDValue();
}