lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/CallingConvLower.h"
  20 #include "llvm/CodeGen/MachineFrameInfo.h"
  21 #include "llvm/CodeGen/MachineInstrBuilder.h"
  22 #include "llvm/CodeGen/MachineRegisterInfo.h"
  23 #include "llvm/CodeGen/SelectionDAG.h"
  24 #include "llvm/IR/Argument.h"
  25 #include "llvm/IR/Function.h"
  26
  27 using namespace llvm;
  28
  29 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  30     AMDGPUTargetLowering(TM),
  31     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  32   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  33   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  34   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  35   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  36   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  37   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  38
  39   computeRegisterProperties();
  40
  41   // Set condition code actions
  42   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  43   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  44   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  45   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  46   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  54
  55   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  56   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  58   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  59
  60   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  61   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  62
  63   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  64   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  65
  66   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  67   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  68
  69   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  70
  71   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  72   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  73   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  74
  75   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  76   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  77
  78   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  79   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  80   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  81
  82   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  83   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  84   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  85   setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
  86   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  87   setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
  88
  89   // Legalize loads and stores to the private address space.
  90   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  91   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  92   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  93
  94   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  95   // spaces, so it is custom lowered to handle those where it isn't.
  96   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  97   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  98   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  99   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
 100   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
 101   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
 102
 103   setOperationAction(ISD::STORE, MVT::i8, Custom);
 104   setOperationAction(ISD::STORE, MVT::i32, Custom);
 105   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 106   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 107   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 108   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 109
 110   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 111   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 112   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 113
 114   setTargetDAGCombine(ISD::FP_ROUND);
 115   setTargetDAGCombine(ISD::FP_TO_SINT);
 116   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 117   setTargetDAGCombine(ISD::SELECT_CC);
 118   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 119
 120   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 121
 122   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 123   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 124   setSchedulingPreference(Sched::Source);
 125 }
 126
 127 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 128     MachineInstr * MI, MachineBasicBlock * BB) const {
 129   MachineFunction * MF = BB->getParent();
 130   MachineRegisterInfo &MRI = MF->getRegInfo();
 131   MachineBasicBlock::iterator I = *MI;
 132   const R600InstrInfo *TII =
 133     static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
 134
 135   switch (MI->getOpcode()) {
 136   default:
 137     // Replace LDS_*_RET instruction that don't have any uses with the
 138     // equivalent LDS_*_NORET instruction.
 139     if (TII->isLDSRetInstr(MI->getOpcode())) {
 140       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 141       assert(DstIdx != -1);
 142       MachineInstrBuilder NewMI;
 143       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
 144         return BB;
 145
 146       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 147                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 148       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 149         NewMI.addOperand(MI->getOperand(i));
 150       }
 151     } else {
 152       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 153     }
 154     break;
 155   case AMDGPU::CLAMP_R600: {
 156     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 157                                                    AMDGPU::MOV,
 158                                                    MI->getOperand(0).getReg(),
 159                                                    MI->getOperand(1).getReg());
 160     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 161     break;
 162   }
 163
 164   case AMDGPU::FABS_R600: {
 165     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 166                                                     AMDGPU::MOV,
 167                                                     MI->getOperand(0).getReg(),
 168                                                     MI->getOperand(1).getReg());
 169     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 170     break;
 171   }
 172
 173   case AMDGPU::FNEG_R600: {
 174     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 175                                                     AMDGPU::MOV,
 176                                                     MI->getOperand(0).getReg(),
 177                                                     MI->getOperand(1).getReg());
 178     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 179     break;
 180   }
 181
 182   case AMDGPU::MASK_WRITE: {
 183     unsigned maskedRegister = MI->getOperand(0).getReg();
 184     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 185     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 186     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 187     break;
 188   }
 189
 190   case AMDGPU::MOV_IMM_F32:
 191     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 192                      MI->getOperand(1).getFPImm()->getValueAPF()
 193                          .bitcastToAPInt().getZExtValue());
 194     break;
 195   case AMDGPU::MOV_IMM_I32:
 196     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 197                      MI->getOperand(1).getImm());
 198     break;
 199   case AMDGPU::CONST_COPY: {
 200     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 201         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 202     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 203         MI->getOperand(1).getImm());
 204     break;
 205   }
 206
 207   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 208   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 209   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 210     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 211
 212     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 213             .addOperand(MI->getOperand(0))
 214             .addOperand(MI->getOperand(1))
 215             .addImm(EOP); // Set End of program bit
 216     break;
 217   }
 218
 219   case AMDGPU::TXD: {
 220     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 221     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 222     MachineOperand &RID = MI->getOperand(4);
 223     MachineOperand &SID = MI->getOperand(5);
 224     unsigned TextureId = MI->getOperand(6).getImm();
 225     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 226     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 227
 228     switch (TextureId) {
 229     case 5: // Rect
 230       CTX = CTY = 0;
 231       break;
 232     case 6: // Shadow1D
 233       SrcW = SrcZ;
 234       break;
 235     case 7: // Shadow2D
 236       SrcW = SrcZ;
 237       break;
 238     case 8: // ShadowRect
 239       CTX = CTY = 0;
 240       SrcW = SrcZ;
 241       break;
 242     case 9: // 1DArray
 243       SrcZ = SrcY;
 244       CTZ = 0;
 245       break;
 246     case 10: // 2DArray
 247       CTZ = 0;
 248       break;
 249     case 11: // Shadow1DArray
 250       SrcZ = SrcY;
 251       CTZ = 0;
 252       break;
 253     case 12: // Shadow2DArray
 254       CTZ = 0;
 255       break;
 256     }
 257     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 258             .addOperand(MI->getOperand(3))
 259             .addImm(SrcX)
 260             .addImm(SrcY)
 261             .addImm(SrcZ)
 262             .addImm(SrcW)
 263             .addImm(0)
 264             .addImm(0)
 265             .addImm(0)
 266             .addImm(0)
 267             .addImm(1)
 268             .addImm(2)
 269             .addImm(3)
 270             .addOperand(RID)
 271             .addOperand(SID)
 272             .addImm(CTX)
 273             .addImm(CTY)
 274             .addImm(CTZ)
 275             .addImm(CTW);
 276     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 277             .addOperand(MI->getOperand(2))
 278             .addImm(SrcX)
 279             .addImm(SrcY)
 280             .addImm(SrcZ)
 281             .addImm(SrcW)
 282             .addImm(0)
 283             .addImm(0)
 284             .addImm(0)
 285             .addImm(0)
 286             .addImm(1)
 287             .addImm(2)
 288             .addImm(3)
 289             .addOperand(RID)
 290             .addOperand(SID)
 291             .addImm(CTX)
 292             .addImm(CTY)
 293             .addImm(CTZ)
 294             .addImm(CTW);
 295     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 296             .addOperand(MI->getOperand(0))
 297             .addOperand(MI->getOperand(1))
 298             .addImm(SrcX)
 299             .addImm(SrcY)
 300             .addImm(SrcZ)
 301             .addImm(SrcW)
 302             .addImm(0)
 303             .addImm(0)
 304             .addImm(0)
 305             .addImm(0)
 306             .addImm(1)
 307             .addImm(2)
 308             .addImm(3)
 309             .addOperand(RID)
 310             .addOperand(SID)
 311             .addImm(CTX)
 312             .addImm(CTY)
 313             .addImm(CTZ)
 314             .addImm(CTW)
 315             .addReg(T0, RegState::Implicit)
 316             .addReg(T1, RegState::Implicit);
 317     break;
 318   }
 319
 320   case AMDGPU::TXD_SHADOW: {
 321     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 322     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 323     MachineOperand &RID = MI->getOperand(4);
 324     MachineOperand &SID = MI->getOperand(5);
 325     unsigned TextureId = MI->getOperand(6).getImm();
 326     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 327     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 328
 329     switch (TextureId) {
 330     case 5: // Rect
 331       CTX = CTY = 0;
 332       break;
 333     case 6: // Shadow1D
 334       SrcW = SrcZ;
 335       break;
 336     case 7: // Shadow2D
 337       SrcW = SrcZ;
 338       break;
 339     case 8: // ShadowRect
 340       CTX = CTY = 0;
 341       SrcW = SrcZ;
 342       break;
 343     case 9: // 1DArray
 344       SrcZ = SrcY;
 345       CTZ = 0;
 346       break;
 347     case 10: // 2DArray
 348       CTZ = 0;
 349       break;
 350     case 11: // Shadow1DArray
 351       SrcZ = SrcY;
 352       CTZ = 0;
 353       break;
 354     case 12: // Shadow2DArray
 355       CTZ = 0;
 356       break;
 357     }
 358
 359     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 360             .addOperand(MI->getOperand(3))
 361             .addImm(SrcX)
 362             .addImm(SrcY)
 363             .addImm(SrcZ)
 364             .addImm(SrcW)
 365             .addImm(0)
 366             .addImm(0)
 367             .addImm(0)
 368             .addImm(0)
 369             .addImm(1)
 370             .addImm(2)
 371             .addImm(3)
 372             .addOperand(RID)
 373             .addOperand(SID)
 374             .addImm(CTX)
 375             .addImm(CTY)
 376             .addImm(CTZ)
 377             .addImm(CTW);
 378     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 379             .addOperand(MI->getOperand(2))
 380             .addImm(SrcX)
 381             .addImm(SrcY)
 382             .addImm(SrcZ)
 383             .addImm(SrcW)
 384             .addImm(0)
 385             .addImm(0)
 386             .addImm(0)
 387             .addImm(0)
 388             .addImm(1)
 389             .addImm(2)
 390             .addImm(3)
 391             .addOperand(RID)
 392             .addOperand(SID)
 393             .addImm(CTX)
 394             .addImm(CTY)
 395             .addImm(CTZ)
 396             .addImm(CTW);
 397     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 398             .addOperand(MI->getOperand(0))
 399             .addOperand(MI->getOperand(1))
 400             .addImm(SrcX)
 401             .addImm(SrcY)
 402             .addImm(SrcZ)
 403             .addImm(SrcW)
 404             .addImm(0)
 405             .addImm(0)
 406             .addImm(0)
 407             .addImm(0)
 408             .addImm(1)
 409             .addImm(2)
 410             .addImm(3)
 411             .addOperand(RID)
 412             .addOperand(SID)
 413             .addImm(CTX)
 414             .addImm(CTY)
 415             .addImm(CTZ)
 416             .addImm(CTW)
 417             .addReg(T0, RegState::Implicit)
 418             .addReg(T1, RegState::Implicit);
 419     break;
 420   }
 421
 422   case AMDGPU::BRANCH:
 423       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 424               .addOperand(MI->getOperand(0));
 425       break;
 426
 427   case AMDGPU::BRANCH_COND_f32: {
 428     MachineInstr *NewMI =
 429       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 430               AMDGPU::PREDICATE_BIT)
 431               .addOperand(MI->getOperand(1))
 432               .addImm(OPCODE_IS_NOT_ZERO)
 433               .addImm(0); // Flags
 434     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 435     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 436             .addOperand(MI->getOperand(0))
 437             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 438     break;
 439   }
 440
 441   case AMDGPU::BRANCH_COND_i32: {
 442     MachineInstr *NewMI =
 443       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 444             AMDGPU::PREDICATE_BIT)
 445             .addOperand(MI->getOperand(1))
 446             .addImm(OPCODE_IS_NOT_ZERO_INT)
 447             .addImm(0); // Flags
 448     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 449     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 450            .addOperand(MI->getOperand(0))
 451             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 452     break;
 453   }
 454
 455   case AMDGPU::EG_ExportSwz:
 456   case AMDGPU::R600_ExportSwz: {
 457     // Instruction is left unmodified if its not the last one of its type
 458     bool isLastInstructionOfItsType = true;
 459     unsigned InstExportType = MI->getOperand(1).getImm();
 460     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 461          EndBlock = BB->end(); NextExportInst != EndBlock;
 462          NextExportInst = std::next(NextExportInst)) {
 463       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 464           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 465         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 466             .getImm();
 467         if (CurrentInstExportType == InstExportType) {
 468           isLastInstructionOfItsType = false;
 469           break;
 470         }
 471       }
 472     }
 473     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 474     if (!EOP && !isLastInstructionOfItsType)
 475       return BB;
 476     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 477     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 478             .addOperand(MI->getOperand(0))
 479             .addOperand(MI->getOperand(1))
 480             .addOperand(MI->getOperand(2))
 481             .addOperand(MI->getOperand(3))
 482             .addOperand(MI->getOperand(4))
 483             .addOperand(MI->getOperand(5))
 484             .addOperand(MI->getOperand(6))
 485             .addImm(CfInst)
 486             .addImm(EOP);
 487     break;
 488   }
 489   case AMDGPU::RETURN: {
 490     // RETURN instructions must have the live-out registers as implicit uses,
 491     // otherwise they appear dead.
 492     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 493     MachineInstrBuilder MIB(*MF, MI);
 494     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 495       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 496     return BB;
 497   }
 498   }
 499
 500   MI->eraseFromParent();
 501   return BB;
 502 }
 503
 504 //===----------------------------------------------------------------------===//
 505 // Custom DAG Lowering Operations
 506 //===----------------------------------------------------------------------===//
 507
 508 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 509   MachineFunction &MF = DAG.getMachineFunction();
 510   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 511   switch (Op.getOpcode()) {
 512   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 513   case ISD::FCOS:
 514   case ISD::FSIN: return LowerTrig(Op, DAG);
 515   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 516   case ISD::STORE: return LowerSTORE(Op, DAG);
 517   case ISD::LOAD: return LowerLOAD(Op, DAG);
 518   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 519   case ISD::INTRINSIC_VOID: {
 520     SDValue Chain = Op.getOperand(0);
 521     unsigned IntrinsicID =
 522                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 523     switch (IntrinsicID) {
 524     case AMDGPUIntrinsic::AMDGPU_store_output: {
 525       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 526       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 527       MFI->LiveOuts.push_back(Reg);
 528       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 529     }
 530     case AMDGPUIntrinsic::R600_store_swizzle: {
 531       const SDValue Args[8] = {
 532         Chain,
 533         Op.getOperand(2), // Export Value
 534         Op.getOperand(3), // ArrayBase
 535         Op.getOperand(4), // Type
 536         DAG.getConstant(0, MVT::i32), // SWZ_X
 537         DAG.getConstant(1, MVT::i32), // SWZ_Y
 538         DAG.getConstant(2, MVT::i32), // SWZ_Z
 539         DAG.getConstant(3, MVT::i32) // SWZ_W
 540       };
 541       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
 542           Args, 8);
 543     }
 544
 545     // default for switch(IntrinsicID)
 546     default: break;
 547     }
 548     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 549     break;
 550   }
 551   case ISD::INTRINSIC_WO_CHAIN: {
 552     unsigned IntrinsicID =
 553                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 554     EVT VT = Op.getValueType();
 555     SDLoc DL(Op);
 556     switch(IntrinsicID) {
 557     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 558     case AMDGPUIntrinsic::R600_load_input: {
 559       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 560       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 561       MachineFunction &MF = DAG.getMachineFunction();
 562       MachineRegisterInfo &MRI = MF.getRegInfo();
 563       MRI.addLiveIn(Reg);
 564       return DAG.getCopyFromReg(DAG.getEntryNode(),
 565           SDLoc(DAG.getEntryNode()), Reg, VT);
 566     }
 567
 568     case AMDGPUIntrinsic::R600_interp_input: {
 569       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 570       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 571       MachineSDNode *interp;
 572       if (ijb < 0) {
 573         const MachineFunction &MF = DAG.getMachineFunction();
 574         const R600InstrInfo *TII =
 575           static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
 576         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 577             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 578         return DAG.getTargetExtractSubreg(
 579             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 580             DL, MVT::f32, SDValue(interp, 0));
 581       }
 582       MachineFunction &MF = DAG.getMachineFunction();
 583       MachineRegisterInfo &MRI = MF.getRegInfo();
 584       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 585       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 586       MRI.addLiveIn(RegisterI);
 587       MRI.addLiveIn(RegisterJ);
 588       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 589           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 590       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 591           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 592
 593       if (slot % 4 < 2)
 594         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 595             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 596             RegisterJNode, RegisterINode);
 597       else
 598         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 599             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 600             RegisterJNode, RegisterINode);
 601       return SDValue(interp, slot % 2);
 602     }
 603     case AMDGPUIntrinsic::R600_interp_xy:
 604     case AMDGPUIntrinsic::R600_interp_zw: {
 605       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 606       MachineSDNode *interp;
 607       SDValue RegisterINode = Op.getOperand(2);
 608       SDValue RegisterJNode = Op.getOperand(3);
 609
 610       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 611         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 612             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 613             RegisterJNode, RegisterINode);
 614       else
 615         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 616             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 617             RegisterJNode, RegisterINode);
 618       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 619           SDValue(interp, 0), SDValue(interp, 1));
 620     }
 621     case AMDGPUIntrinsic::R600_tex:
 622     case AMDGPUIntrinsic::R600_texc:
 623     case AMDGPUIntrinsic::R600_txl:
 624     case AMDGPUIntrinsic::R600_txlc:
 625     case AMDGPUIntrinsic::R600_txb:
 626     case AMDGPUIntrinsic::R600_txbc:
 627     case AMDGPUIntrinsic::R600_txf:
 628     case AMDGPUIntrinsic::R600_txq:
 629     case AMDGPUIntrinsic::R600_ddx:
 630     case AMDGPUIntrinsic::R600_ddy:
 631     case AMDGPUIntrinsic::R600_ldptr: {
 632       unsigned TextureOp;
 633       switch (IntrinsicID) {
 634       case AMDGPUIntrinsic::R600_tex:
 635         TextureOp = 0;
 636         break;
 637       case AMDGPUIntrinsic::R600_texc:
 638         TextureOp = 1;
 639         break;
 640       case AMDGPUIntrinsic::R600_txl:
 641         TextureOp = 2;
 642         break;
 643       case AMDGPUIntrinsic::R600_txlc:
 644         TextureOp = 3;
 645         break;
 646       case AMDGPUIntrinsic::R600_txb:
 647         TextureOp = 4;
 648         break;
 649       case AMDGPUIntrinsic::R600_txbc:
 650         TextureOp = 5;
 651         break;
 652       case AMDGPUIntrinsic::R600_txf:
 653         TextureOp = 6;
 654         break;
 655       case AMDGPUIntrinsic::R600_txq:
 656         TextureOp = 7;
 657         break;
 658       case AMDGPUIntrinsic::R600_ddx:
 659         TextureOp = 8;
 660         break;
 661       case AMDGPUIntrinsic::R600_ddy:
 662         TextureOp = 9;
 663         break;
 664       case AMDGPUIntrinsic::R600_ldptr:
 665         TextureOp = 10;
 666         break;
 667       default:
 668         llvm_unreachable("Unknow Texture Operation");
 669       }
 670
 671       SDValue TexArgs[19] = {
 672         DAG.getConstant(TextureOp, MVT::i32),
 673         Op.getOperand(1),
 674         DAG.getConstant(0, MVT::i32),
 675         DAG.getConstant(1, MVT::i32),
 676         DAG.getConstant(2, MVT::i32),
 677         DAG.getConstant(3, MVT::i32),
 678         Op.getOperand(2),
 679         Op.getOperand(3),
 680         Op.getOperand(4),
 681         DAG.getConstant(0, MVT::i32),
 682         DAG.getConstant(1, MVT::i32),
 683         DAG.getConstant(2, MVT::i32),
 684         DAG.getConstant(3, MVT::i32),
 685         Op.getOperand(5),
 686         Op.getOperand(6),
 687         Op.getOperand(7),
 688         Op.getOperand(8),
 689         Op.getOperand(9),
 690         Op.getOperand(10)
 691       };
 692       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
 693     }
 694     case AMDGPUIntrinsic::AMDGPU_dp4: {
 695       SDValue Args[8] = {
 696       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 697           DAG.getConstant(0, MVT::i32)),
 698       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 699           DAG.getConstant(0, MVT::i32)),
 700       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 701           DAG.getConstant(1, MVT::i32)),
 702       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 703           DAG.getConstant(1, MVT::i32)),
 704       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 705           DAG.getConstant(2, MVT::i32)),
 706       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 707           DAG.getConstant(2, MVT::i32)),
 708       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 709           DAG.getConstant(3, MVT::i32)),
 710       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 711           DAG.getConstant(3, MVT::i32))
 712       };
 713       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
 714     }
 715
 716     case Intrinsic::r600_read_ngroups_x:
 717       return LowerImplicitParameter(DAG, VT, DL, 0);
 718     case Intrinsic::r600_read_ngroups_y:
 719       return LowerImplicitParameter(DAG, VT, DL, 1);
 720     case Intrinsic::r600_read_ngroups_z:
 721       return LowerImplicitParameter(DAG, VT, DL, 2);
 722     case Intrinsic::r600_read_global_size_x:
 723       return LowerImplicitParameter(DAG, VT, DL, 3);
 724     case Intrinsic::r600_read_global_size_y:
 725       return LowerImplicitParameter(DAG, VT, DL, 4);
 726     case Intrinsic::r600_read_global_size_z:
 727       return LowerImplicitParameter(DAG, VT, DL, 5);
 728     case Intrinsic::r600_read_local_size_x:
 729       return LowerImplicitParameter(DAG, VT, DL, 6);
 730     case Intrinsic::r600_read_local_size_y:
 731       return LowerImplicitParameter(DAG, VT, DL, 7);
 732     case Intrinsic::r600_read_local_size_z:
 733       return LowerImplicitParameter(DAG, VT, DL, 8);
 734
 735     case Intrinsic::r600_read_tgid_x:
 736       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 737                                   AMDGPU::T1_X, VT);
 738     case Intrinsic::r600_read_tgid_y:
 739       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 740                                   AMDGPU::T1_Y, VT);
 741     case Intrinsic::r600_read_tgid_z:
 742       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 743                                   AMDGPU::T1_Z, VT);
 744     case Intrinsic::r600_read_tidig_x:
 745       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 746                                   AMDGPU::T0_X, VT);
 747     case Intrinsic::r600_read_tidig_y:
 748       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 749                                   AMDGPU::T0_Y, VT);
 750     case Intrinsic::r600_read_tidig_z:
 751       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 752                                   AMDGPU::T0_Z, VT);
 753     }
 754     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 755     break;
 756   }
 757   } // end switch(Op.getOpcode())
 758   return SDValue();
 759 }
 760
 761 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 762                                             SmallVectorImpl<SDValue> &Results,
 763                                             SelectionDAG &DAG) const {
 764   switch (N->getOpcode()) {
 765   default:
 766     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 767     return;
 768   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 769     return;
 770   case ISD::LOAD: {
 771     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 772     Results.push_back(SDValue(Node, 0));
 773     Results.push_back(SDValue(Node, 1));
 774     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 775     // function
 776     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 777     return;
 778   }
 779   case ISD::STORE:
 780     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 781     Results.push_back(SDValue(Node, 0));
 782     return;
 783   }
 784 }
 785
 786 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 787   // On hw >= R700, COS/SIN input must be between -1. and 1.
 788   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 789   EVT VT = Op.getValueType();
 790   SDValue Arg = Op.getOperand(0);
 791   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 792       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 793         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 794           DAG.getConstantFP(0.15915494309, MVT::f32)),
 795         DAG.getConstantFP(0.5, MVT::f32)));
 796   unsigned TrigNode;
 797   switch (Op.getOpcode()) {
 798   case ISD::FCOS:
 799     TrigNode = AMDGPUISD::COS_HW;
 800     break;
 801   case ISD::FSIN:
 802     TrigNode = AMDGPUISD::SIN_HW;
 803     break;
 804   default:
 805     llvm_unreachable("Wrong trig opcode");
 806   }
 807   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
 808       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
 809         DAG.getConstantFP(-0.5, MVT::f32)));
 810   if (Gen >= AMDGPUSubtarget::R700)
 811     return TrigVal;
 812   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 813   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
 814       DAG.getConstantFP(3.14159265359, MVT::f32));
 815 }
 816
 817 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 818   return DAG.getNode(
 819       ISD::SETCC,
 820       SDLoc(Op),
 821       MVT::i1,
 822       Op, DAG.getConstantFP(0.0f, MVT::f32),
 823       DAG.getCondCode(ISD::SETNE)
 824       );
 825 }
 826
 827 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 828                                                    SDLoc DL,
 829                                                    unsigned DwordOffset) const {
 830   unsigned ByteOffset = DwordOffset * 4;
 831   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 832                                       AMDGPUAS::CONSTANT_BUFFER_0);
 833
 834   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 835   assert(isInt<16>(ByteOffset));
 836
 837   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 838                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 839                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 840                      false, false, false, 0);
 841 }
 842
 843 bool R600TargetLowering::isZero(SDValue Op) const {
 844   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 845     return Cst->isNullValue();
 846   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 847     return CstFP->isZero();
 848   } else {
 849     return false;
 850   }
 851 }
 852
 853 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
       // Lowers a SELECT_CC node whose operands are (LHS, RHS, True, False, CC).
       // The following strategies are tried in order; the first that applies
       // returns:
       //   1. a SELECT_CC shaped for a SET* instruction (True/False are the
       //      hardware true/false values),
       //   2. a SELECT_CC shaped for a CND* instruction (one compare operand is
       //      zero),
       //   3. whatever LowerMinMax() produces,
       //   4. a fallback built from two chained SELECT_CC nodes.
 854   SDLoc DL(Op);
 855   EVT VT = Op.getValueType();
 856
 857   SDValue LHS = Op.getOperand(0);
 858   SDValue RHS = Op.getOperand(1);
 859   SDValue True = Op.getOperand(2);
 860   SDValue False = Op.getOperand(3);
 861   SDValue CC = Op.getOperand(4);
 862   SDValue Temp; // Scratch used to exchange True/False in the CND* path.
 863
 864   // LHS and RHS are guaranteed to be the same value type
 865   EVT CompareVT = LHS.getValueType();
 866
 867   // Check if we can lower this to a native operation.
 868
 869   // Try to lower to a SET* instruction:
 870   //
 871   // SET* can match the following patterns:
 872   //
 873   // select_cc f32, f32, -1,  0, cc_supported
 874   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
 875   // select_cc i32, i32, -1,  0, cc_supported
 876   //
 877
 878   // Move hardware True/False values to the correct operand.
 879   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 880   ISD::CondCode InverseCC =
 881      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
       // True/False arrive reversed relative to the hardware values: exchange
       // them and invert the condition, or, if the inverse is not legal, invert
       // the condition and also swap the compare operands. If neither condition
       // code is legal, nothing is changed here.
 882   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 883     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
 884       std::swap(False, True);
 885       CC = DAG.getCondCode(InverseCC);
 886     } else {
 887       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
 888       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
 889         std::swap(False, True);
 890         std::swap(LHS, RHS);
 891         CC = DAG.getCondCode(SwapInvCC);
 892       }
 893     }
 894   }
 895
 896   if (isHWTrueValue(True) && isHWFalseValue(False) &&
 897       (CompareVT == VT || VT == MVT::i32)) {
 898     // This can be matched by a SET* instruction.
 899     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 900   }
 901
 902   // Try to lower to a CND* instruction:
 903   //
 904   // CND* can match the following patterns:
 905   //
 906   // select_cc f32, 0.0, f32, f32, cc_supported
 907   // select_cc f32, 0.0, i32, i32, cc_supported
 908   // select_cc i32, 0,   f32, f32, cc_supported
 909   // select_cc i32, 0,   i32, i32, cc_supported
 910   //
 911
 912   // Try to move the zero value to the RHS
 913   if (isZero(LHS)) {
         // This local CCOpcode shadows the one declared above; CC is re-read
         // because it may have been replaced while handling the SET* case.
 914     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 915     // Try swapping the operands
 916     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
 917     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
 918       std::swap(LHS, RHS);
 919       CC = DAG.getCondCode(CCSwapped);
 920     } else {
 921       // Try inverting the condition and then swapping the operands
 922       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
 923       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
 924       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
 925         std::swap(True, False);
 926         std::swap(LHS, RHS);
 927         CC = DAG.getCondCode(CCSwapped);
 928       }
 929     }
 930   }
 931   if (isZero(RHS)) {
 932     SDValue Cond = LHS;
 933     SDValue Zero = RHS;
         // Re-read the condition code: CC may have been updated above.
 934     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 935     if (CompareVT != VT) {
 936       // Bitcast True / False to the correct types.  This will end up being
 937       // a nop, but it allows us to define only a single pattern in the
 938       // .TD files for each CND* instruction rather than having to have
 939       // one pattern for integer True/False and one for fp True/False
 940       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 941       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 942     }
 943
         // A not-equal condition is replaced by its inverse with True and False
         // exchanged; every other condition code is used unchanged.
 944     switch (CCOpcode) {
 945     case ISD::SETONE:
 946     case ISD::SETUNE:
 947     case ISD::SETNE:
 948       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 949       Temp = True;
 950       True = False;
 951       False = Temp;
 952       break;
 953     default:
 954       break;
 955     }
         // The select is built in CompareVT and bitcast back to the result type.
 956     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 957         Cond, Zero,
 958         True, False,
 959         DAG.getCondCode(CCOpcode));
 960     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 961   }
 962
 963
 964   // Possible Min/Max pattern
 965   SDValue MinMax = LowerMinMax(Op, DAG);
 966   if (MinMax.getNode()) {
 967     return MinMax;
 968   }
 969
 970   // If we make it this far it means we have no native instructions to handle
 971   // this SELECT_CC, so we must lower it.
 972   SDValue HWTrue, HWFalse;
 973
       // Hardware true/false constants in the compare type: 1.0f / 0.0f for f32,
       // -1 / 0 for i32. Any other compare type is not handled.
 974   if (CompareVT == MVT::f32) {
 975     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 976     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 977   } else if (CompareVT == MVT::i32) {
 978     HWTrue = DAG.getConstant(-1, CompareVT);
 979     HWFalse = DAG.getConstant(0, CompareVT);
 980   }
 981   else {
 982     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
 983   }
 984
 985   // Lower this unsupported SELECT_CC into a combination of two supported
 986   // SELECT_CC operations.
       // The first select yields HWTrue/HWFalse for the original comparison; the
       // second picks True when that result differs from HWFalse.
 987   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 988
 989   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 990       Cond, HWFalse,
 991       True, False,
 992       DAG.getCondCode(ISD::SETNE));
 993 }
 994
 995 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
 996 /// convert these pointers to a register index.  Each register holds
 997 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
 998 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
 999 /// for indirect addressing.
1000 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1001                                                unsigned StackWidth,
1002                                                SelectionDAG &DAG) const {
1003   unsigned SRLPad;
1004   switch(StackWidth) {
1005   case 1:
1006     SRLPad = 2;
1007     break;
1008   case 2:
1009     SRLPad = 3;
1010     break;
1011   case 4:
1012     SRLPad = 4;
1013     break;
1014   default: llvm_unreachable("Invalid stack width");
1015   }
1016
1017   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1018                      DAG.getConstant(SRLPad, MVT::i32));
1019 }
1020
1021 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1022                                          unsigned ElemIdx,
1023                                          unsigned &Channel,
1024                                          unsigned &PtrIncr) const {
1025   switch (StackWidth) {
1026   default:
1027   case 1:
1028     Channel = 0;
1029     if (ElemIdx > 0) {
1030       PtrIncr = 1;
1031     } else {
1032       PtrIncr = 0;
1033     }
1034     break;
1035   case 2:
1036     Channel = ElemIdx % 2;
1037     if (ElemIdx == 2) {
1038       PtrIncr = 1;
1039     } else {
1040       PtrIncr = 0;
1041     }
1042     break;
1043   case 4:
1044     Channel = ElemIdx;
1045     PtrIncr = 0;
1046     break;
1047   }
1048 }
1049
1050 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
       // Custom lowering for STORE nodes (operands: Chain, Value, Ptr).
       //  - The common AMDGPUTargetLowering::LowerSTORE is tried first and its
       //    result is returned when it produces a node.
       //  - Global address space: truncating stores (i8 / i16 memory type) become
       //    an AMDGPUISD::STORE_MSKOR memory intrinsic; other stores of at least
       //    32 bits whose pointer is not already a DWORDADDR node are re-emitted
       //    with the pointer shifted right by 2 and wrapped in DWORDADDR.
       //  - Private address space: the store becomes AMDGPUISD::REGISTER_STORE
       //    node(s), one per vector element for vector values.
       //  - Everything else returns an empty SDValue.
1051   SDLoc DL(Op);
1052   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1053   SDValue Chain = Op.getOperand(0);
1054   SDValue Value = Op.getOperand(1);
1055   SDValue Ptr = Op.getOperand(2);
1056
1057   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1058   if (Result.getNode()) {
1059     return Result;
1060   }
1061
1062   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1063     if (StoreNode->isTruncatingStore()) {
1064       EVT VT = Value.getValueType();
1065       assert(VT.bitsLE(MVT::i32));
1066       EVT MemVT = StoreNode->getMemoryVT();
           // Mask covering the bits of the memory type (8 or 16 bits).
1067       SDValue MaskConstant;
1068       if (MemVT == MVT::i8) {
1069         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1070       } else {
1071         assert(MemVT == MVT::i16);
1072         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1073       }
           // DWordAddr = Ptr >> 2, ByteIndex = Ptr & 3, Shift = ByteIndex << 3
           // (the bit offset of the addressed byte inside its dword). The masked
           // value and the mask are both shifted left by that amount.
           // NOTE(review): DWordAddr and Shift are built with the value type VT
           // while ByteIndex uses the pointer type — presumably both are i32
           // here; confirm.
1074       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1075                                       DAG.getConstant(2, MVT::i32));
1076       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1077                                       DAG.getConstant(0x00000003, VT));
1078       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1079       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1080                                    DAG.getConstant(3, VT));
1081       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1082       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1083       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1084       // vector instead.
           // The intrinsic input is the v4i32 { ShiftedValue, 0, 0, Mask }.
1085       SDValue Src[4] = {
1086         ShiftedValue,
1087         DAG.getConstant(0, MVT::i32),
1088         DAG.getConstant(0, MVT::i32),
1089         Mask
1090       };
1091       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
1092       SDValue Args[3] = { Chain, Input, DWordAddr };
1093       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1094                                      Op->getVTList(), Args, 3, MemVT,
1095                                      StoreNode->getMemOperand());
1096     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1097                Value.getValueType().bitsGE(MVT::i32)) {
1098       // Convert pointer from byte address to dword address.
1099       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1100                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1101                                     Ptr, DAG.getConstant(2, MVT::i32)));
1102
           // Truncating stores were already handled by the branch above, so only
           // an indexed store can reach the unreachable below.
1103       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1104         llvm_unreachable("Truncated and indexed stores not supported yet");
1105       } else {
1106         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1107       }
1108       return Chain;
1109     }
1110   }
1111
1112   EVT ValueVT = Value.getValueType();
1113
       // From here on only private-address stores are handled.
1114   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1115     return SDValue();
1116   }
1117
       // NOTE(review): this repeats the call made at the top of the function with
       // the same Op; it looks redundant unless the common lowering can return a
       // different result the second time — confirm before removing.
1118   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1119   if (Ret.getNode()) {
1120     return Ret;
1121   }
1122   // Lowering for indirect addressing
1123
1124   const MachineFunction &MF = DAG.getMachineFunction();
1125   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1126                                          getTargetMachine().getFrameLowering());
1127   unsigned StackWidth = TFL->getStackWidth(MF);
1128
       // Turn the byte address into a register index.
1129   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1130
1131   if (ValueVT.isVector()) {
1132     unsigned NumElemVT = ValueVT.getVectorNumElements();
1133     EVT ElemVT = ValueVT.getVectorElementType();
         // One REGISTER_STORE per element; the array holds at most 4 entries.
1134     SDValue Stores[4];
1135
         // NOTE(review): the message below says "in load" although this is the
         // store path — presumably copied from LowerLOAD.
1136     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1137                                       "vector width in load");
1138
1139     for (unsigned i = 0; i < NumElemVT; ++i) {
1140       unsigned Channel, PtrIncr;
1141       getStackAddress(StackWidth, i, Channel, PtrIncr);
           // Ptr is updated in place, so the increments accumulate across
           // iterations.
1142       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1143                         DAG.getConstant(PtrIncr, MVT::i32));
1144       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1145                                  Value, DAG.getConstant(i, MVT::i32));
1146
1147       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1148                               Chain, Elem, Ptr,
1149                               DAG.getTargetConstant(Channel, MVT::i32));
1150     }
         // Join the per-element stores into a single chain.
1151      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1152    } else {
         // i8 values are zero-extended to i32 before being stored.
1153     if (ValueVT == MVT::i8) {
1154       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1155     }
1156     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1157     DAG.getTargetConstant(0, MVT::i32)); // Channel
1158   }
1159
1160   return Chain;
1161 }
1162
1163 // return (512 + (kc_bank << 12)
1164 static int
1165 ConstantAddressBlock(unsigned AddressSpace) {
1166   switch (AddressSpace) {
1167   case AMDGPUAS::CONSTANT_BUFFER_0:
1168     return 512;
1169   case AMDGPUAS::CONSTANT_BUFFER_1:
1170     return 512 + 4096;
1171   case AMDGPUAS::CONSTANT_BUFFER_2:
1172     return 512 + 4096 * 2;
1173   case AMDGPUAS::CONSTANT_BUFFER_3:
1174     return 512 + 4096 * 3;
1175   case AMDGPUAS::CONSTANT_BUFFER_4:
1176     return 512 + 4096 * 4;
1177   case AMDGPUAS::CONSTANT_BUFFER_5:
1178     return 512 + 4096 * 5;
1179   case AMDGPUAS::CONSTANT_BUFFER_6:
1180     return 512 + 4096 * 6;
1181   case AMDGPUAS::CONSTANT_BUFFER_7:
1182     return 512 + 4096 * 7;
1183   case AMDGPUAS::CONSTANT_BUFFER_8:
1184     return 512 + 4096 * 8;
1185   case AMDGPUAS::CONSTANT_BUFFER_9:
1186     return 512 + 4096 * 9;
1187   case AMDGPUAS::CONSTANT_BUFFER_10:
1188     return 512 + 4096 * 10;
1189   case AMDGPUAS::CONSTANT_BUFFER_11:
1190     return 512 + 4096 * 11;
1191   case AMDGPUAS::CONSTANT_BUFFER_12:
1192     return 512 + 4096 * 12;
1193   case AMDGPUAS::CONSTANT_BUFFER_13:
1194     return 512 + 4096 * 13;
1195   case AMDGPUAS::CONSTANT_BUFFER_14:
1196     return 512 + 4096 * 14;
1197   case AMDGPUAS::CONSTANT_BUFFER_15:
1198     return 512 + 4096 * 15;
1199   default:
1200     return -1;
1201   }
1202 }
1203
1204 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1205 {
       // Custom lowering for LOAD nodes (operands: Chain, Ptr, ...). Every path
       // that lowers the load returns a (value, chain) pair built with
       // getMergeValues, where the chain is the incoming Chain. Cases, in order:
       //  - AMDGPUTargetLowering::LowerLOAD produced a node: use it.
       //  - Vector load from LOCAL_ADDRESS: SplitVectorLoad.
       //  - Non-extending or zero-extending load from a constant buffer address
       //    space: AMDGPUISD::CONST_ADDRESS node(s).
       //  - SEXTLOAD: EXTLOAD followed by SHL / SRA.
       //  - PRIVATE_ADDRESS: AMDGPUISD::REGISTER_LOAD node(s).
       //  - Anything else: empty SDValue.
1206   EVT VT = Op.getValueType();
1207   SDLoc DL(Op);
1208   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1209   SDValue Chain = Op.getOperand(0);
1210   SDValue Ptr = Op.getOperand(1);
1211   SDValue LoweredLoad;
1212
1213   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1214   if (Ret.getNode()) {
1215     SDValue Ops[2];
1216     Ops[0] = Ret;
1217     Ops[1] = Chain;
1218     return DAG.getMergeValues(Ops, 2, DL);
1219   }
1220
1221
1222   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1223     SDValue MergedValues[2] = {
1224       SplitVectorLoad(Op, DAG),
1225       Chain
1226     };
1227     return DAG.getMergeValues(MergedValues, 2, DL);
1228   }
1229
       // ConstantBlock is -1 unless the load reads one of the constant buffers.
1230   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1231   if (ConstantBlock > -1 &&
1232       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1233        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1234     SDValue Result;
         // NOTE(review): ConstantExpr is a kind of Constant in LLVM IR, so the
         // first isa<> test looks subsumed by the second — confirm.
1235     if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
1236         isa<Constant>(LoadNode->getSrcValue()) ||
1237         isa<ConstantSDNode>(Ptr)) {
1238       SDValue Slots[4];
1239       for (unsigned i = 0; i < 4; i++) {
1240         // We want Const position encoded with the following formula :
1241         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1242         // const_index is Ptr computed by llvm using an alignment of 16.
1243         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1244         // then div by 4 at the ISel step
1245         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1246             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1247         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1248       }
           // Scalar loads build a full v4i32; vector loads build a vector of
           // their own type from the first NumElements slots.
1249       EVT NewVT = MVT::v4i32;
1250       unsigned NumElements = 4;
1251       if (VT.isVector()) {
1252         NewVT = VT;
1253         NumElements = VT.getVectorNumElements();
1254       }
1255       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
1256     } else {
1257       // non-constant ptr can't be folded, keep it as a single v4i32
           // CONST_ADDRESS node: operands are Ptr >> 4 and the address space's
           // offset from CONSTANT_BUFFER_0.
1258       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1259           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1260           DAG.getConstant(LoadNode->getAddressSpace() -
1261                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1262           );
1263     }
1264
         // Scalar result: take element 0 of the vector, typed as i32.
1265     if (!VT.isVector()) {
1266       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1267           DAG.getConstant(0, MVT::i32));
1268     }
1269
1270     SDValue MergedValues[2] = {
1271         Result,
1272         Chain
1273     };
1274     return DAG.getMergeValues(MergedValues, 2, DL);
1275   }
1276
1277   // For most operations returning SDValue() will result in the node being
1278   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1279   // need to manually expand loads that may be legal in some address spaces and
1280   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1281   // compute shaders, since the data is sign extended when it is uploaded to the
1282   // buffer. However SEXT loads from other address spaces are not supported, so
1283   // we need to expand them here.
1284   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1285     EVT MemVT = LoadNode->getMemoryVT();
1286     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
         // Shift by the width difference between result and memory types: SHL
         // moves the loaded bits to the top, SRA shifts them back arithmetically.
1287     SDValue ShiftAmount =
1288           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1289     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1290                                   LoadNode->getPointerInfo(), MemVT,
1291                                   LoadNode->isVolatile(),
1292                                   LoadNode->isNonTemporal(),
1293                                   LoadNode->getAlignment());
1294     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1295     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1296
1297     SDValue MergedValues[2] = { Sra, Chain };
1298     return DAG.getMergeValues(MergedValues, 2, DL);
1299   }
1300
       // From here on only private-address loads are handled.
1301   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1302     return SDValue();
1303   }
1304
1305   // Lowering for indirect addressing
1306   const MachineFunction &MF = DAG.getMachineFunction();
1307   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1308                                          getTargetMachine().getFrameLowering());
1309   unsigned StackWidth = TFL->getStackWidth(MF);
1310
       // Turn the byte address into a register index.
1311   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1312
1313   if (VT.isVector()) {
1314     unsigned NumElemVT = VT.getVectorNumElements();
1315     EVT ElemVT = VT.getVectorElementType();
1316     SDValue Loads[4];
1317
1318     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1319                                       "vector width in load");
1320
1321     for (unsigned i = 0; i < NumElemVT; ++i) {
1322       unsigned Channel, PtrIncr;
1323       getStackAddress(StackWidth, i, Channel, PtrIncr);
           // Ptr is updated in place, so the increments accumulate across
           // iterations.
1324       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1325                         DAG.getConstant(PtrIncr, MVT::i32));
1326       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1327                              Chain, Ptr,
1328                              DAG.getTargetConstant(Channel, MVT::i32),
1329                              Op.getOperand(2));
1330     }
         // Pad the remaining lanes with UNDEF so a 4-element vector can be built.
1331     for (unsigned i = NumElemVT; i < 4; ++i) {
1332       Loads[i] = DAG.getUNDEF(ElemVT);
1333     }
         // NOTE(review): the result is always a 4-element vector of ElemVT, even
         // when VT has fewer elements — confirm callers expect that.
1334     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1335     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1336   } else {
1337     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1338                               Chain, Ptr,
1339                               DAG.getTargetConstant(0, MVT::i32), // Channel
1340                               Op.getOperand(2));
1341   }
1342
1343   SDValue Ops[2];
1344   Ops[0] = LoweredLoad;
1345   Ops[1] = Chain;
1346
1347   return DAG.getMergeValues(Ops, 2, DL);
1348 }
1349
1350 /// XXX Only kernel functions are supported, so we can assume for now that
1351 /// every function is a kernel function, but in the future we should use
1352 /// separate calling conventions for kernel and non-kernel functions.
1353 SDValue R600TargetLowering::LowerFormalArguments(
1354                                       SDValue Chain,
1355                                       CallingConv::ID CallConv,
1356                                       bool isVarArg,
1357                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1358                                       SDLoc DL, SelectionDAG &DAG,
1359                                       SmallVectorImpl<SDValue> &InVals) const {
1360   SmallVector<CCValAssign, 16> ArgLocs;
1361   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1362                  getTargetMachine(), ArgLocs, *DAG.getContext());
1363   MachineFunction &MF = DAG.getMachineFunction();
1364   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
1365
1366   SmallVector<ISD::InputArg, 8> LocalIns;
1367
1368   getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
1369                           LocalIns);
1370
1371   AnalyzeFormalArguments(CCInfo, LocalIns);
1372
1373   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1374     CCValAssign &VA = ArgLocs[i];
1375     EVT VT = Ins[i].VT;
1376     EVT MemVT = LocalIns[i].VT;
1377
1378     if (ShaderType != ShaderType::COMPUTE) {
1379       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1380       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1381       InVals.push_back(Register);
1382       continue;
1383     }
1384
1385     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1386                                                    AMDGPUAS::CONSTANT_BUFFER_0);
1387
1388     // i64 isn't a legal type, so the register type used ends up as i32, which
1389     // isn't expected here. It attempts to create this sextload, but it ends up
1390     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1391     // for <1 x i64>.
1392
1393     // The first 36 bytes of the input buffer contains information about
1394     // thread group and global sizes.
1395     SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
1396                                  DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1397                                  MachinePointerInfo(UndefValue::get(PtrTy)),
1398                                  MemVT, false, false, 4);
1399     // 4 is the preferred alignment for
1400     // the CONSTANT memory space.
1401     InVals.push_back(Arg);
1402   }
1403   return Chain;
1404 }
1405
1406 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1407    if (!VT.isVector()) return MVT::i32;
1408    return VT.changeVectorElementTypeToInteger();
1409 }
1410
1411 static SDValue
1412 CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
1413                         DenseMap<unsigned, unsigned> &RemapSwizzle) {
1414   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1415   assert(RemapSwizzle.empty());
1416   SDValue NewBldVec[4] = {
1417       VectorEntry.getOperand(0),
1418       VectorEntry.getOperand(1),
1419       VectorEntry.getOperand(2),
1420       VectorEntry.getOperand(3)
1421   };
1422
1423   for (unsigned i = 0; i < 4; i++) {
1424     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1425       // We mask write here to teach later passes that the ith element of this
1426       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1427       // break false dependencies and additionnaly make assembly easier to read.
1428       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1429     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1430       if (C->isZero()) {
1431         RemapSwizzle[i] = 4; // SEL_0
1432         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1433       } else if (C->isExactlyValue(1.0)) {
1434         RemapSwizzle[i] = 5; // SEL_1
1435         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1436       }
1437     }
1438
1439     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1440       continue;
1441     for (unsigned j = 0; j < i; j++) {
1442       if (NewBldVec[i] == NewBldVec[j]) {
1443         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1444         RemapSwizzle[i] = j;
1445         break;
1446       }
1447     }
1448   }
1449
1450   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1451       VectorEntry.getValueType(), NewBldVec, 4);
1452 }
1453
1454 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1455                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1456   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1457   assert(RemapSwizzle.empty());
1458   SDValue NewBldVec[4] = {
1459       VectorEntry.getOperand(0),
1460       VectorEntry.getOperand(1),
1461       VectorEntry.getOperand(2),
1462       VectorEntry.getOperand(3)
1463   };
1464   bool isUnmovable[4] = { false, false, false, false };
1465   for (unsigned i = 0; i < 4; i++) {
1466     RemapSwizzle[i] = i;
1467     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1468       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1469           ->getZExtValue();
1470       if (i == Idx)
1471         isUnmovable[Idx] = true;
1472     }
1473   }
1474
1475   for (unsigned i = 0; i < 4; i++) {
1476     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1477       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1478           ->getZExtValue();
1479       if (isUnmovable[Idx])
1480         continue;
1481       // Swap i and Idx
1482       std::swap(NewBldVec[Idx], NewBldVec[i]);
1483       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1484       break;
1485     }
1486   }
1487
1488   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1489       VectorEntry.getValueType(), NewBldVec, 4);
1490 }
1491
1492
1493 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1494 SDValue Swz[4], SelectionDAG &DAG) const {
1495   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1496   // Old -> New swizzle values
1497   DenseMap<unsigned, unsigned> SwizzleRemap;
1498
1499   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1500   for (unsigned i = 0; i < 4; i++) {
1501     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1502     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1503       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1504   }
1505
1506   SwizzleRemap.clear();
1507   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1508   for (unsigned i = 0; i < 4; i++) {
1509     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1510     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1511       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1512   }
1513
1514   return BuildVector;
1515 }
1516
1517
1518 //===----------------------------------------------------------------------===//
1519 // Custom DAG Optimizations
1520 //===----------------------------------------------------------------------===//
1521
1522 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1523                                               DAGCombinerInfo &DCI) const {
1524   SelectionDAG &DAG = DCI.DAG;
1525
1526   switch (N->getOpcode()) {
1527   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1528   case ISD::FP_ROUND: {
1529       SDValue Arg = N->getOperand(0);
1530       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1531         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1532                            Arg.getOperand(0));
1533       }
1534       break;
1535     }
1536
1537   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1538   // (i32 select_cc f32, f32, -1, 0 cc)
1539   //
1540   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1541   // this to one of the SET*_DX10 instructions.
1542   case ISD::FP_TO_SINT: {
1543     SDValue FNeg = N->getOperand(0);
1544     if (FNeg.getOpcode() != ISD::FNEG) {
1545       return SDValue();
1546     }
1547     SDValue SelectCC = FNeg.getOperand(0);
1548     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1549         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1550         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1551         !isHWTrueValue(SelectCC.getOperand(2)) ||
1552         !isHWFalseValue(SelectCC.getOperand(3))) {
1553       return SDValue();
1554     }
1555
1556     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1557                            SelectCC.getOperand(0), // LHS
1558                            SelectCC.getOperand(1), // RHS
1559                            DAG.getConstant(-1, MVT::i32), // True
1560                            DAG.getConstant(0, MVT::i32),  // Flase
1561                            SelectCC.getOperand(4)); // CC
1562
1563     break;
1564   }
1565
1566   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1567   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1568   case ISD::INSERT_VECTOR_ELT: {
1569     SDValue InVec = N->getOperand(0);
1570     SDValue InVal = N->getOperand(1);
1571     SDValue EltNo = N->getOperand(2);
1572     SDLoc dl(N);
1573
1574     // If the inserted element is an UNDEF, just use the input vector.
1575     if (InVal.getOpcode() == ISD::UNDEF)
1576       return InVec;
1577
1578     EVT VT = InVec.getValueType();
1579
1580     // If we can't generate a legal BUILD_VECTOR, exit
1581     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1582       return SDValue();
1583
1584     // Check that we know which element is being inserted
1585     if (!isa<ConstantSDNode>(EltNo))
1586       return SDValue();
1587     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1588
1589     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1590     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1591     // vector elements.
1592     SmallVector<SDValue, 8> Ops;
1593     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1594       Ops.append(InVec.getNode()->op_begin(),
1595                  InVec.getNode()->op_end());
1596     } else if (InVec.getOpcode() == ISD::UNDEF) {
1597       unsigned NElts = VT.getVectorNumElements();
1598       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1599     } else {
1600       return SDValue();
1601     }
1602
1603     // Insert the element
1604     if (Elt < Ops.size()) {
1605       // All the operands of BUILD_VECTOR must have the same type;
1606       // we enforce that here.
1607       EVT OpVT = Ops[0].getValueType();
1608       if (InVal.getValueType() != OpVT)
1609         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1610           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1611           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1612       Ops[Elt] = InVal;
1613     }
1614
1615     // Return the new vector
1616     return DAG.getNode(ISD::BUILD_VECTOR, dl,
1617                        VT, &Ops[0], Ops.size());
1618   }
1619
1620   // Extract_vec (Build_vector) generated by custom lowering
1621   // also needs to be customly combined
1622   case ISD::EXTRACT_VECTOR_ELT: {
1623     SDValue Arg = N->getOperand(0);
1624     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1625       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1626         unsigned Element = Const->getZExtValue();
1627         return Arg->getOperand(Element);
1628       }
1629     }
1630     if (Arg.getOpcode() == ISD::BITCAST &&
1631         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1632       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1633         unsigned Element = Const->getZExtValue();
1634         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1635             Arg->getOperand(0).getOperand(Element));
1636       }
1637     }
1638   }
1639
1640   case ISD::SELECT_CC: {
1641     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1642     //      selectcc x, y, a, b, inv(cc)
1643     //
1644     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1645     //      selectcc x, y, a, b, cc
1646     SDValue LHS = N->getOperand(0);
1647     if (LHS.getOpcode() != ISD::SELECT_CC) {
1648       return SDValue();
1649     }
1650
1651     SDValue RHS = N->getOperand(1);
1652     SDValue True = N->getOperand(2);
1653     SDValue False = N->getOperand(3);
1654     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1655
1656     if (LHS.getOperand(2).getNode() != True.getNode() ||
1657         LHS.getOperand(3).getNode() != False.getNode() ||
1658         RHS.getNode() != False.getNode()) {
1659       return SDValue();
1660     }
1661
1662     switch (NCC) {
1663     default: return SDValue();
1664     case ISD::SETNE: return LHS;
1665     case ISD::SETEQ: {
1666       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1667       LHSCC = ISD::getSetCCInverse(LHSCC,
1668                                   LHS.getOperand(0).getValueType().isInteger());
1669       if (DCI.isBeforeLegalizeOps() ||
1670           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1671         return DAG.getSelectCC(SDLoc(N),
1672                                LHS.getOperand(0),
1673                                LHS.getOperand(1),
1674                                LHS.getOperand(2),
1675                                LHS.getOperand(3),
1676                                LHSCC);
1677       break;
1678     }
1679     }
1680     return SDValue();
1681   }
1682
1683   case AMDGPUISD::EXPORT: {
1684     SDValue Arg = N->getOperand(1);
1685     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1686       break;
1687
1688     SDValue NewArgs[8] = {
1689       N->getOperand(0), // Chain
1690       SDValue(),
1691       N->getOperand(2), // ArrayBase
1692       N->getOperand(3), // Type
1693       N->getOperand(4), // SWZ_X
1694       N->getOperand(5), // SWZ_Y
1695       N->getOperand(6), // SWZ_Z
1696       N->getOperand(7) // SWZ_W
1697     };
1698     SDLoc DL(N);
1699     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1700     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1701   }
1702   case AMDGPUISD::TEXTURE_FETCH: {
1703     SDValue Arg = N->getOperand(1);
1704     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1705       break;
1706
1707     SDValue NewArgs[19] = {
1708       N->getOperand(0),
1709       N->getOperand(1),
1710       N->getOperand(2),
1711       N->getOperand(3),
1712       N->getOperand(4),
1713       N->getOperand(5),
1714       N->getOperand(6),
1715       N->getOperand(7),
1716       N->getOperand(8),
1717       N->getOperand(9),
1718       N->getOperand(10),
1719       N->getOperand(11),
1720       N->getOperand(12),
1721       N->getOperand(13),
1722       N->getOperand(14),
1723       N->getOperand(15),
1724       N->getOperand(16),
1725       N->getOperand(17),
1726       N->getOperand(18),
1727     };
1728     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1729     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1730         NewArgs, 19);
1731   }
1732   }
1733   return SDValue();
1734 }
1735
1736 static bool
1737 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
1738             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
1739   const R600InstrInfo *TII =
1740       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1741   if (!Src.isMachineOpcode())
1742     return false;
1743   switch (Src.getMachineOpcode()) {
1744   case AMDGPU::FNEG_R600:
1745     if (!Neg.getNode())
1746       return false;
1747     Src = Src.getOperand(0);
1748     Neg = DAG.getTargetConstant(1, MVT::i32);
1749     return true;
1750   case AMDGPU::FABS_R600:
1751     if (!Abs.getNode())
1752       return false;
1753     Src = Src.getOperand(0);
1754     Abs = DAG.getTargetConstant(1, MVT::i32);
1755     return true;
1756   case AMDGPU::CONST_COPY: {
1757     unsigned Opcode = ParentNode->getMachineOpcode();
1758     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1759
1760     if (!Sel.getNode())
1761       return false;
1762
1763     SDValue CstOffset = Src.getOperand(0);
1764     if (ParentNode->getValueType(0).isVector())
1765       return false;
1766
1767     // Gather constants values
1768     int SrcIndices[] = {
1769       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1770       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1771       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
1772       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1773       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1774       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1775       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1776       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1777       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1778       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1779       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1780     };
1781     std::vector<unsigned> Consts;
1782     for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
1783       int OtherSrcIdx = SrcIndices[i];
1784       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1785       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
1786         continue;
1787       if (HasDst) {
1788         OtherSrcIdx--;
1789         OtherSelIdx--;
1790       }
1791       if (RegisterSDNode *Reg =
1792           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
1793         if (Reg->getReg() == AMDGPU::ALU_CONST) {
1794           ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
1795               ParentNode->getOperand(OtherSelIdx));
1796           Consts.push_back(Cst->getZExtValue());
1797         }
1798       }
1799     }
1800
1801     ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
1802     Consts.push_back(Cst->getZExtValue());
1803     if (!TII->fitsConstReadLimitations(Consts)) {
1804       return false;
1805     }
1806
1807     Sel = CstOffset;
1808     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
1809     return true;
1810   }
1811   case AMDGPU::MOV_IMM_I32:
1812   case AMDGPU::MOV_IMM_F32: {
1813     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
1814     uint64_t ImmValue = 0;
1815
1816
1817     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
1818       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
1819       float FloatValue = FPC->getValueAPF().convertToFloat();
1820       if (FloatValue == 0.0) {
1821         ImmReg = AMDGPU::ZERO;
1822       } else if (FloatValue == 0.5) {
1823         ImmReg = AMDGPU::HALF;
1824       } else if (FloatValue == 1.0) {
1825         ImmReg = AMDGPU::ONE;
1826       } else {
1827         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
1828       }
1829     } else {
1830       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
1831       uint64_t Value = C->getZExtValue();
1832       if (Value == 0) {
1833         ImmReg = AMDGPU::ZERO;
1834       } else if (Value == 1) {
1835         ImmReg = AMDGPU::ONE_INT;
1836       } else {
1837         ImmValue = Value;
1838       }
1839     }
1840
1841     // Check that we aren't already using an immediate.
1842     // XXX: It's possible for an instruction to have more than one
1843     // immediate operand, but this is not supported yet.
1844     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
1845       if (!Imm.getNode())
1846         return false;
1847       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
1848       assert(C);
1849       if (C->getZExtValue())
1850         return false;
1851       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
1852     }
1853     Src = DAG.getRegister(ImmReg, MVT::i32);
1854     return true;
1855   }
1856   default:
1857     return false;
1858   }
1859 }
1860
1861
1862 /// \brief Fold the instructions after selecting them
1863 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
1864                                             SelectionDAG &DAG) const {
1865   const R600InstrInfo *TII =
1866       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1867   if (!Node->isMachineOpcode())
1868     return Node;
1869   unsigned Opcode = Node->getMachineOpcode();
1870   SDValue FakeOp;
1871
1872   std::vector<SDValue> Ops;
1873   for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
1874               I != E; ++I)
1875           Ops.push_back(*I);
1876
1877   if (Opcode == AMDGPU::DOT_4) {
1878     int OperandIdx[] = {
1879       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1880       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1881       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1882       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1883       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1884       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1885       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1886       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1887         };
1888     int NegIdx[] = {
1889       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
1890       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
1891       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
1892       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
1893       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
1894       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
1895       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
1896       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
1897     };
1898     int AbsIdx[] = {
1899       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
1900       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
1901       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
1902       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
1903       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
1904       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
1905       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
1906       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
1907     };
1908     for (unsigned i = 0; i < 8; i++) {
1909       if (OperandIdx[i] < 0)
1910         return Node;
1911       SDValue &Src = Ops[OperandIdx[i] - 1];
1912       SDValue &Neg = Ops[NegIdx[i] - 1];
1913       SDValue &Abs = Ops[AbsIdx[i] - 1];
1914       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1915       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1916       if (HasDst)
1917         SelIdx--;
1918       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1919       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
1920         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1921     }
1922   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
1923     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
1924       SDValue &Src = Ops[i];
1925       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
1926         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1927     }
1928   } else if (Opcode == AMDGPU::CLAMP_R600) {
1929     SDValue Src = Node->getOperand(0);
1930     if (!Src.isMachineOpcode() ||
1931         !TII->hasInstrModifiers(Src.getMachineOpcode()))
1932       return Node;
1933     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
1934         AMDGPU::OpName::clamp);
1935     if (ClampIdx < 0)
1936       return Node;
1937     std::vector<SDValue> Ops;
1938     unsigned NumOp = Src.getNumOperands();
1939     for(unsigned i = 0; i < NumOp; ++i)
1940           Ops.push_back(Src.getOperand(i));
1941     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
1942     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
1943         Node->getVTList(), Ops);
1944   } else {
1945     if (!TII->hasInstrModifiers(Opcode))
1946       return Node;
1947     int OperandIdx[] = {
1948       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1949       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1950       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
1951     };
1952     int NegIdx[] = {
1953       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
1954       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
1955       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
1956     };
1957     int AbsIdx[] = {
1958       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
1959       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
1960       -1
1961     };
1962     for (unsigned i = 0; i < 3; i++) {
1963       if (OperandIdx[i] < 0)
1964         return Node;
1965       SDValue &Src = Ops[OperandIdx[i] - 1];
1966       SDValue &Neg = Ops[NegIdx[i] - 1];
1967       SDValue FakeAbs;
1968       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
1969       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1970       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1971       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
1972       if (HasDst) {
1973         SelIdx--;
1974         ImmIdx--;
1975       }
1976       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1977       SDValue &Imm = Ops[ImmIdx];
1978       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
1979         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1980     }
1981   }
1982
1983   return Node;
1984 }