lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/CallingConvLower.h"
  20 #include "llvm/CodeGen/MachineFrameInfo.h"
  21 #include "llvm/CodeGen/MachineInstrBuilder.h"
  22 #include "llvm/CodeGen/MachineRegisterInfo.h"
  23 #include "llvm/CodeGen/SelectionDAG.h"
  24 #include "llvm/IR/Argument.h"
  25 #include "llvm/IR/Function.h"
  26
  27 using namespace llvm;
  28
  29 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  30     AMDGPUTargetLowering(TM),
  31     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  32   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  33   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  34   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  35   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  36   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  37   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  38
  39   computeRegisterProperties();
  40
  41   // Set condition code actions
  42   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  43   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  44   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  45   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  46   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  54
  55   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  56   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  58   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  59
  60   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  61   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  62
  63   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  64   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  65
  66   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  67   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  68
  69   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  70
  71   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  72   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  73   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  74
  75   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  76   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  77
  78   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  79   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  80   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  81
  82   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  83   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  84   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  85   setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
  86   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  87   setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
  88
  89   // Legalize loads and stores to the private address space.
  90   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  91   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  92   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  93   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  94   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  95   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  96   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  97   setOperationAction(ISD::STORE, MVT::i8, Custom);
  98   setOperationAction(ISD::STORE, MVT::i32, Custom);
  99   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 100   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 101   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 102   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 103
 104   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 105   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 106   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 107
 108   setTargetDAGCombine(ISD::FP_ROUND);
 109   setTargetDAGCombine(ISD::FP_TO_SINT);
 110   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 111   setTargetDAGCombine(ISD::SELECT_CC);
 112   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 113
 114   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 115
 116   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 117   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 118   setSchedulingPreference(Sched::Source);
 119 }
 120
 121 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 122     MachineInstr * MI, MachineBasicBlock * BB) const {
 123   MachineFunction * MF = BB->getParent();
 124   MachineRegisterInfo &MRI = MF->getRegInfo();
 125   MachineBasicBlock::iterator I = *MI;
 126   const R600InstrInfo *TII =
 127     static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
 128
 129   switch (MI->getOpcode()) {
 130   default:
 131     if (TII->isLDSInstr(MI->getOpcode()) &&
 132         TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst) != -1) {
 133       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 134       assert(DstIdx != -1);
 135       MachineInstrBuilder NewMI;
 136       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) {
 137         NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()),
 138                         AMDGPU::OQAP);
 139         TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
 140                                      MI->getOperand(0).getReg(),
 141                                      AMDGPU::OQAP);
 142       } else {
 143         NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 144                         TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 145       }
 146       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 147         NewMI.addOperand(MI->getOperand(i));
 148       }
 149     } else {
 150       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 151     }
 152     break;
 153   case AMDGPU::CLAMP_R600: {
 154     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 155                                                    AMDGPU::MOV,
 156                                                    MI->getOperand(0).getReg(),
 157                                                    MI->getOperand(1).getReg());
 158     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 159     break;
 160   }
 161
 162   case AMDGPU::FABS_R600: {
 163     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 164                                                     AMDGPU::MOV,
 165                                                     MI->getOperand(0).getReg(),
 166                                                     MI->getOperand(1).getReg());
 167     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 168     break;
 169   }
 170
 171   case AMDGPU::FNEG_R600: {
 172     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 173                                                     AMDGPU::MOV,
 174                                                     MI->getOperand(0).getReg(),
 175                                                     MI->getOperand(1).getReg());
 176     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 177     break;
 178   }
 179
 180   case AMDGPU::MASK_WRITE: {
 181     unsigned maskedRegister = MI->getOperand(0).getReg();
 182     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 183     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 184     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 185     break;
 186   }
 187
 188   case AMDGPU::MOV_IMM_F32:
 189     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 190                      MI->getOperand(1).getFPImm()->getValueAPF()
 191                          .bitcastToAPInt().getZExtValue());
 192     break;
 193   case AMDGPU::MOV_IMM_I32:
 194     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 195                      MI->getOperand(1).getImm());
 196     break;
 197   case AMDGPU::CONST_COPY: {
 198     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 199         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 200     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 201         MI->getOperand(1).getImm());
 202     break;
 203   }
 204
 205   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 206   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 207   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 208     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 209
 210     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 211             .addOperand(MI->getOperand(0))
 212             .addOperand(MI->getOperand(1))
 213             .addImm(EOP); // Set End of program bit
 214     break;
 215   }
 216
 217   case AMDGPU::TXD: {
 218     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 219     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 220     MachineOperand &RID = MI->getOperand(4);
 221     MachineOperand &SID = MI->getOperand(5);
 222     unsigned TextureId = MI->getOperand(6).getImm();
 223     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 224     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 225
 226     switch (TextureId) {
 227     case 5: // Rect
 228       CTX = CTY = 0;
 229       break;
 230     case 6: // Shadow1D
 231       SrcW = SrcZ;
 232       break;
 233     case 7: // Shadow2D
 234       SrcW = SrcZ;
 235       break;
 236     case 8: // ShadowRect
 237       CTX = CTY = 0;
 238       SrcW = SrcZ;
 239       break;
 240     case 9: // 1DArray
 241       SrcZ = SrcY;
 242       CTZ = 0;
 243       break;
 244     case 10: // 2DArray
 245       CTZ = 0;
 246       break;
 247     case 11: // Shadow1DArray
 248       SrcZ = SrcY;
 249       CTZ = 0;
 250       break;
 251     case 12: // Shadow2DArray
 252       CTZ = 0;
 253       break;
 254     }
 255     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 256             .addOperand(MI->getOperand(3))
 257             .addImm(SrcX)
 258             .addImm(SrcY)
 259             .addImm(SrcZ)
 260             .addImm(SrcW)
 261             .addImm(0)
 262             .addImm(0)
 263             .addImm(0)
 264             .addImm(0)
 265             .addImm(1)
 266             .addImm(2)
 267             .addImm(3)
 268             .addOperand(RID)
 269             .addOperand(SID)
 270             .addImm(CTX)
 271             .addImm(CTY)
 272             .addImm(CTZ)
 273             .addImm(CTW);
 274     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 275             .addOperand(MI->getOperand(2))
 276             .addImm(SrcX)
 277             .addImm(SrcY)
 278             .addImm(SrcZ)
 279             .addImm(SrcW)
 280             .addImm(0)
 281             .addImm(0)
 282             .addImm(0)
 283             .addImm(0)
 284             .addImm(1)
 285             .addImm(2)
 286             .addImm(3)
 287             .addOperand(RID)
 288             .addOperand(SID)
 289             .addImm(CTX)
 290             .addImm(CTY)
 291             .addImm(CTZ)
 292             .addImm(CTW);
 293     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 294             .addOperand(MI->getOperand(0))
 295             .addOperand(MI->getOperand(1))
 296             .addImm(SrcX)
 297             .addImm(SrcY)
 298             .addImm(SrcZ)
 299             .addImm(SrcW)
 300             .addImm(0)
 301             .addImm(0)
 302             .addImm(0)
 303             .addImm(0)
 304             .addImm(1)
 305             .addImm(2)
 306             .addImm(3)
 307             .addOperand(RID)
 308             .addOperand(SID)
 309             .addImm(CTX)
 310             .addImm(CTY)
 311             .addImm(CTZ)
 312             .addImm(CTW)
 313             .addReg(T0, RegState::Implicit)
 314             .addReg(T1, RegState::Implicit);
 315     break;
 316   }
 317
 318   case AMDGPU::TXD_SHADOW: {
 319     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 320     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 321     MachineOperand &RID = MI->getOperand(4);
 322     MachineOperand &SID = MI->getOperand(5);
 323     unsigned TextureId = MI->getOperand(6).getImm();
 324     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 325     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 326
 327     switch (TextureId) {
 328     case 5: // Rect
 329       CTX = CTY = 0;
 330       break;
 331     case 6: // Shadow1D
 332       SrcW = SrcZ;
 333       break;
 334     case 7: // Shadow2D
 335       SrcW = SrcZ;
 336       break;
 337     case 8: // ShadowRect
 338       CTX = CTY = 0;
 339       SrcW = SrcZ;
 340       break;
 341     case 9: // 1DArray
 342       SrcZ = SrcY;
 343       CTZ = 0;
 344       break;
 345     case 10: // 2DArray
 346       CTZ = 0;
 347       break;
 348     case 11: // Shadow1DArray
 349       SrcZ = SrcY;
 350       CTZ = 0;
 351       break;
 352     case 12: // Shadow2DArray
 353       CTZ = 0;
 354       break;
 355     }
 356
 357     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 358             .addOperand(MI->getOperand(3))
 359             .addImm(SrcX)
 360             .addImm(SrcY)
 361             .addImm(SrcZ)
 362             .addImm(SrcW)
 363             .addImm(0)
 364             .addImm(0)
 365             .addImm(0)
 366             .addImm(0)
 367             .addImm(1)
 368             .addImm(2)
 369             .addImm(3)
 370             .addOperand(RID)
 371             .addOperand(SID)
 372             .addImm(CTX)
 373             .addImm(CTY)
 374             .addImm(CTZ)
 375             .addImm(CTW);
 376     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 377             .addOperand(MI->getOperand(2))
 378             .addImm(SrcX)
 379             .addImm(SrcY)
 380             .addImm(SrcZ)
 381             .addImm(SrcW)
 382             .addImm(0)
 383             .addImm(0)
 384             .addImm(0)
 385             .addImm(0)
 386             .addImm(1)
 387             .addImm(2)
 388             .addImm(3)
 389             .addOperand(RID)
 390             .addOperand(SID)
 391             .addImm(CTX)
 392             .addImm(CTY)
 393             .addImm(CTZ)
 394             .addImm(CTW);
 395     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 396             .addOperand(MI->getOperand(0))
 397             .addOperand(MI->getOperand(1))
 398             .addImm(SrcX)
 399             .addImm(SrcY)
 400             .addImm(SrcZ)
 401             .addImm(SrcW)
 402             .addImm(0)
 403             .addImm(0)
 404             .addImm(0)
 405             .addImm(0)
 406             .addImm(1)
 407             .addImm(2)
 408             .addImm(3)
 409             .addOperand(RID)
 410             .addOperand(SID)
 411             .addImm(CTX)
 412             .addImm(CTY)
 413             .addImm(CTZ)
 414             .addImm(CTW)
 415             .addReg(T0, RegState::Implicit)
 416             .addReg(T1, RegState::Implicit);
 417     break;
 418   }
 419
 420   case AMDGPU::BRANCH:
 421       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 422               .addOperand(MI->getOperand(0));
 423       break;
 424
 425   case AMDGPU::BRANCH_COND_f32: {
 426     MachineInstr *NewMI =
 427       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 428               AMDGPU::PREDICATE_BIT)
 429               .addOperand(MI->getOperand(1))
 430               .addImm(OPCODE_IS_NOT_ZERO)
 431               .addImm(0); // Flags
 432     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 433     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 434             .addOperand(MI->getOperand(0))
 435             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 436     break;
 437   }
 438
 439   case AMDGPU::BRANCH_COND_i32: {
 440     MachineInstr *NewMI =
 441       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 442             AMDGPU::PREDICATE_BIT)
 443             .addOperand(MI->getOperand(1))
 444             .addImm(OPCODE_IS_NOT_ZERO_INT)
 445             .addImm(0); // Flags
 446     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 447     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 448            .addOperand(MI->getOperand(0))
 449             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 450     break;
 451   }
 452
 453   case AMDGPU::EG_ExportSwz:
 454   case AMDGPU::R600_ExportSwz: {
 455     // Instruction is left unmodified if its not the last one of its type
 456     bool isLastInstructionOfItsType = true;
 457     unsigned InstExportType = MI->getOperand(1).getImm();
 458     for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
 459          EndBlock = BB->end(); NextExportInst != EndBlock;
 460          NextExportInst = llvm::next(NextExportInst)) {
 461       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 462           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 463         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 464             .getImm();
 465         if (CurrentInstExportType == InstExportType) {
 466           isLastInstructionOfItsType = false;
 467           break;
 468         }
 469       }
 470     }
 471     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 472     if (!EOP && !isLastInstructionOfItsType)
 473       return BB;
 474     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 475     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 476             .addOperand(MI->getOperand(0))
 477             .addOperand(MI->getOperand(1))
 478             .addOperand(MI->getOperand(2))
 479             .addOperand(MI->getOperand(3))
 480             .addOperand(MI->getOperand(4))
 481             .addOperand(MI->getOperand(5))
 482             .addOperand(MI->getOperand(6))
 483             .addImm(CfInst)
 484             .addImm(EOP);
 485     break;
 486   }
 487   case AMDGPU::RETURN: {
 488     // RETURN instructions must have the live-out registers as implicit uses,
 489     // otherwise they appear dead.
 490     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 491     MachineInstrBuilder MIB(*MF, MI);
 492     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 493       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 494     return BB;
 495   }
 496   }
 497
 498   MI->eraseFromParent();
 499   return BB;
 500 }
 501
 502 //===----------------------------------------------------------------------===//
 503 // Custom DAG Lowering Operations
 504 //===----------------------------------------------------------------------===//
 505
 506 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 507   MachineFunction &MF = DAG.getMachineFunction();
 508   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 509   switch (Op.getOpcode()) {
 510   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 511   case ISD::FCOS:
 512   case ISD::FSIN: return LowerTrig(Op, DAG);
 513   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 514   case ISD::STORE: return LowerSTORE(Op, DAG);
 515   case ISD::LOAD: return LowerLOAD(Op, DAG);
 516   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
 517   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 518   case ISD::INTRINSIC_VOID: {
 519     SDValue Chain = Op.getOperand(0);
 520     unsigned IntrinsicID =
 521                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 522     switch (IntrinsicID) {
 523     case AMDGPUIntrinsic::AMDGPU_store_output: {
 524       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 525       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 526       MFI->LiveOuts.push_back(Reg);
 527       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 528     }
 529     case AMDGPUIntrinsic::R600_store_swizzle: {
 530       const SDValue Args[8] = {
 531         Chain,
 532         Op.getOperand(2), // Export Value
 533         Op.getOperand(3), // ArrayBase
 534         Op.getOperand(4), // Type
 535         DAG.getConstant(0, MVT::i32), // SWZ_X
 536         DAG.getConstant(1, MVT::i32), // SWZ_Y
 537         DAG.getConstant(2, MVT::i32), // SWZ_Z
 538         DAG.getConstant(3, MVT::i32) // SWZ_W
 539       };
 540       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
 541           Args, 8);
 542     }
 543
 544     // default for switch(IntrinsicID)
 545     default: break;
 546     }
 547     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 548     break;
 549   }
 550   case ISD::INTRINSIC_WO_CHAIN: {
 551     unsigned IntrinsicID =
 552                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 553     EVT VT = Op.getValueType();
 554     SDLoc DL(Op);
 555     switch(IntrinsicID) {
 556     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 557     case AMDGPUIntrinsic::R600_load_input: {
 558       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 559       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 560       MachineFunction &MF = DAG.getMachineFunction();
 561       MachineRegisterInfo &MRI = MF.getRegInfo();
 562       MRI.addLiveIn(Reg);
 563       return DAG.getCopyFromReg(DAG.getEntryNode(),
 564           SDLoc(DAG.getEntryNode()), Reg, VT);
 565     }
 566
 567     case AMDGPUIntrinsic::R600_interp_input: {
 568       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 569       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 570       MachineSDNode *interp;
 571       if (ijb < 0) {
 572         const MachineFunction &MF = DAG.getMachineFunction();
 573         const R600InstrInfo *TII =
 574           static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
 575         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 576             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 577         return DAG.getTargetExtractSubreg(
 578             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 579             DL, MVT::f32, SDValue(interp, 0));
 580       }
 581       MachineFunction &MF = DAG.getMachineFunction();
 582       MachineRegisterInfo &MRI = MF.getRegInfo();
 583       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 584       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 585       MRI.addLiveIn(RegisterI);
 586       MRI.addLiveIn(RegisterJ);
 587       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 588           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 589       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 590           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 591
 592       if (slot % 4 < 2)
 593         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 594             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 595             RegisterJNode, RegisterINode);
 596       else
 597         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 598             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 599             RegisterJNode, RegisterINode);
 600       return SDValue(interp, slot % 2);
 601     }
 602     case AMDGPUIntrinsic::R600_interp_xy:
 603     case AMDGPUIntrinsic::R600_interp_zw: {
 604       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 605       MachineSDNode *interp;
 606       SDValue RegisterINode = Op.getOperand(2);
 607       SDValue RegisterJNode = Op.getOperand(3);
 608
 609       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 610         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 611             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 612             RegisterJNode, RegisterINode);
 613       else
 614         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 615             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 616             RegisterJNode, RegisterINode);
 617       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 618           SDValue(interp, 0), SDValue(interp, 1));
 619     }
 620     case AMDGPUIntrinsic::R600_tex:
 621     case AMDGPUIntrinsic::R600_texc:
 622     case AMDGPUIntrinsic::R600_txl:
 623     case AMDGPUIntrinsic::R600_txlc:
 624     case AMDGPUIntrinsic::R600_txb:
 625     case AMDGPUIntrinsic::R600_txbc:
 626     case AMDGPUIntrinsic::R600_txf:
 627     case AMDGPUIntrinsic::R600_txq:
 628     case AMDGPUIntrinsic::R600_ddx:
 629     case AMDGPUIntrinsic::R600_ddy:
 630     case AMDGPUIntrinsic::R600_ldptr: {
 631       unsigned TextureOp;
 632       switch (IntrinsicID) {
 633       case AMDGPUIntrinsic::R600_tex:
 634         TextureOp = 0;
 635         break;
 636       case AMDGPUIntrinsic::R600_texc:
 637         TextureOp = 1;
 638         break;
 639       case AMDGPUIntrinsic::R600_txl:
 640         TextureOp = 2;
 641         break;
 642       case AMDGPUIntrinsic::R600_txlc:
 643         TextureOp = 3;
 644         break;
 645       case AMDGPUIntrinsic::R600_txb:
 646         TextureOp = 4;
 647         break;
 648       case AMDGPUIntrinsic::R600_txbc:
 649         TextureOp = 5;
 650         break;
 651       case AMDGPUIntrinsic::R600_txf:
 652         TextureOp = 6;
 653         break;
 654       case AMDGPUIntrinsic::R600_txq:
 655         TextureOp = 7;
 656         break;
 657       case AMDGPUIntrinsic::R600_ddx:
 658         TextureOp = 8;
 659         break;
 660       case AMDGPUIntrinsic::R600_ddy:
 661         TextureOp = 9;
 662         break;
 663       case AMDGPUIntrinsic::R600_ldptr:
 664         TextureOp = 10;
 665         break;
 666       default:
 667         llvm_unreachable("Unknow Texture Operation");
 668       }
 669
 670       SDValue TexArgs[19] = {
 671         DAG.getConstant(TextureOp, MVT::i32),
 672         Op.getOperand(1),
 673         DAG.getConstant(0, MVT::i32),
 674         DAG.getConstant(1, MVT::i32),
 675         DAG.getConstant(2, MVT::i32),
 676         DAG.getConstant(3, MVT::i32),
 677         Op.getOperand(2),
 678         Op.getOperand(3),
 679         Op.getOperand(4),
 680         DAG.getConstant(0, MVT::i32),
 681         DAG.getConstant(1, MVT::i32),
 682         DAG.getConstant(2, MVT::i32),
 683         DAG.getConstant(3, MVT::i32),
 684         Op.getOperand(5),
 685         Op.getOperand(6),
 686         Op.getOperand(7),
 687         Op.getOperand(8),
 688         Op.getOperand(9),
 689         Op.getOperand(10)
 690       };
 691       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
 692     }
 693     case AMDGPUIntrinsic::AMDGPU_dp4: {
 694       SDValue Args[8] = {
 695       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 696           DAG.getConstant(0, MVT::i32)),
 697       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 698           DAG.getConstant(0, MVT::i32)),
 699       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 700           DAG.getConstant(1, MVT::i32)),
 701       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 702           DAG.getConstant(1, MVT::i32)),
 703       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 704           DAG.getConstant(2, MVT::i32)),
 705       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 706           DAG.getConstant(2, MVT::i32)),
 707       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 708           DAG.getConstant(3, MVT::i32)),
 709       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 710           DAG.getConstant(3, MVT::i32))
 711       };
 712       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
 713     }
 714
 715     case Intrinsic::r600_read_ngroups_x:
 716       return LowerImplicitParameter(DAG, VT, DL, 0);
 717     case Intrinsic::r600_read_ngroups_y:
 718       return LowerImplicitParameter(DAG, VT, DL, 1);
 719     case Intrinsic::r600_read_ngroups_z:
 720       return LowerImplicitParameter(DAG, VT, DL, 2);
 721     case Intrinsic::r600_read_global_size_x:
 722       return LowerImplicitParameter(DAG, VT, DL, 3);
 723     case Intrinsic::r600_read_global_size_y:
 724       return LowerImplicitParameter(DAG, VT, DL, 4);
 725     case Intrinsic::r600_read_global_size_z:
 726       return LowerImplicitParameter(DAG, VT, DL, 5);
 727     case Intrinsic::r600_read_local_size_x:
 728       return LowerImplicitParameter(DAG, VT, DL, 6);
 729     case Intrinsic::r600_read_local_size_y:
 730       return LowerImplicitParameter(DAG, VT, DL, 7);
 731     case Intrinsic::r600_read_local_size_z:
 732       return LowerImplicitParameter(DAG, VT, DL, 8);
 733
 734     case Intrinsic::r600_read_tgid_x:
 735       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 736                                   AMDGPU::T1_X, VT);
 737     case Intrinsic::r600_read_tgid_y:
 738       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 739                                   AMDGPU::T1_Y, VT);
 740     case Intrinsic::r600_read_tgid_z:
 741       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 742                                   AMDGPU::T1_Z, VT);
 743     case Intrinsic::r600_read_tidig_x:
 744       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 745                                   AMDGPU::T0_X, VT);
 746     case Intrinsic::r600_read_tidig_y:
 747       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 748                                   AMDGPU::T0_Y, VT);
 749     case Intrinsic::r600_read_tidig_z:
 750       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 751                                   AMDGPU::T0_Z, VT);
 752     }
 753     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 754     break;
 755   }
 756   } // end switch(Op.getOpcode())
 757   return SDValue();
 758 }
 759
 760 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 761                                             SmallVectorImpl<SDValue> &Results,
 762                                             SelectionDAG &DAG) const {
 763   switch (N->getOpcode()) {
 764   default: return;
 765   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 766     return;
 767   case ISD::LOAD: {
 768     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 769     Results.push_back(SDValue(Node, 0));
 770     Results.push_back(SDValue(Node, 1));
 771     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 772     // function
 773     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 774     return;
 775   }
 776   case ISD::STORE:
 777     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 778     Results.push_back(SDValue(Node, 0));
 779     return;
 780   }
 781 }
 782
 783 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 784   // On hw >= R700, COS/SIN input must be between -1. and 1.
 785   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 786   EVT VT = Op.getValueType();
 787   SDValue Arg = Op.getOperand(0);
 788   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 789       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 790         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 791           DAG.getConstantFP(0.15915494309, MVT::f32)),
 792         DAG.getConstantFP(0.5, MVT::f32)));
 793   unsigned TrigNode;
 794   switch (Op.getOpcode()) {
 795   case ISD::FCOS:
 796     TrigNode = AMDGPUISD::COS_HW;
 797     break;
 798   case ISD::FSIN:
 799     TrigNode = AMDGPUISD::SIN_HW;
 800     break;
 801   default:
 802     llvm_unreachable("Wrong trig opcode");
 803   }
 804   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
 805       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
 806         DAG.getConstantFP(-0.5, MVT::f32)));
 807   if (Gen >= AMDGPUSubtarget::R700)
 808     return TrigVal;
 809   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 810   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
 811       DAG.getConstantFP(3.14159265359, MVT::f32));
 812 }
 813
 814 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 815   return DAG.getNode(
 816       ISD::SETCC,
 817       SDLoc(Op),
 818       MVT::i1,
 819       Op, DAG.getConstantFP(0.0f, MVT::f32),
 820       DAG.getCondCode(ISD::SETNE)
 821       );
 822 }
 823
 824 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 825                                                    SDLoc DL,
 826                                                    unsigned DwordOffset) const {
 827   unsigned ByteOffset = DwordOffset * 4;
 828   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 829                                       AMDGPUAS::CONSTANT_BUFFER_0);
 830
 831   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 832   assert(isInt<16>(ByteOffset));
 833
 834   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 835                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 836                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 837                      false, false, false, 0);
 838 }
 839
 840 SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
 841
 842   MachineFunction &MF = DAG.getMachineFunction();
 843   const AMDGPUFrameLowering *TFL =
 844    static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
 845
 846   FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
 847   assert(FIN);
 848
 849   unsigned FrameIndex = FIN->getIndex();
 850   unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
 851   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
 852 }
 853
 854 bool R600TargetLowering::isZero(SDValue Op) const {
 855   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 856     return Cst->isNullValue();
 857   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 858     return CstFP->isZero();
 859   } else {
 860     return false;
 861   }
 862 }
 863
 864 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 865   SDLoc DL(Op);
 866   EVT VT = Op.getValueType();
 867
 868   SDValue LHS = Op.getOperand(0);
 869   SDValue RHS = Op.getOperand(1);
 870   SDValue True = Op.getOperand(2);
 871   SDValue False = Op.getOperand(3);
 872   SDValue CC = Op.getOperand(4);
 873   SDValue Temp;
 874
 875   // LHS and RHS are guaranteed to be the same value type
 876   EVT CompareVT = LHS.getValueType();
 877
 878   // Check if we can lower this to a native operation.
 879
 880   // Try to lower to a SET* instruction:
 881   //
 882   // SET* can match the following patterns:
 883   //
 884   // select_cc f32, f32, -1,  0, cc_supported
 885   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
 886   // select_cc i32, i32, -1,  0, cc_supported
 887   //
 888
 889   // Move hardware True/False values to the correct operand.
 890   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 891   ISD::CondCode InverseCC =
 892      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 893   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 894     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
 895       std::swap(False, True);
 896       CC = DAG.getCondCode(InverseCC);
 897     } else {
 898       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
 899       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
 900         std::swap(False, True);
 901         std::swap(LHS, RHS);
 902         CC = DAG.getCondCode(SwapInvCC);
 903       }
 904     }
 905   }
 906
 907   if (isHWTrueValue(True) && isHWFalseValue(False) &&
 908       (CompareVT == VT || VT == MVT::i32)) {
 909     // This can be matched by a SET* instruction.
 910     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 911   }
 912
 913   // Try to lower to a CND* instruction:
 914   //
 915   // CND* can match the following patterns:
 916   //
 917   // select_cc f32, 0.0, f32, f32, cc_supported
 918   // select_cc f32, 0.0, i32, i32, cc_supported
 919   // select_cc i32, 0,   f32, f32, cc_supported
 920   // select_cc i32, 0,   i32, i32, cc_supported
 921   //
 922
 923   // Try to move the zero value to the RHS
 924   if (isZero(LHS)) {
 925     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 926     // Try swapping the operands
 927     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
 928     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
 929       std::swap(LHS, RHS);
 930       CC = DAG.getCondCode(CCSwapped);
 931     } else {
 932       // Try inverting the conditon and then swapping the operands
 933       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
 934       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
 935       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
 936         std::swap(True, False);
 937         std::swap(LHS, RHS);
 938         CC = DAG.getCondCode(CCSwapped);
 939       }
 940     }
 941   }
 942   if (isZero(RHS)) {
 943     SDValue Cond = LHS;
 944     SDValue Zero = RHS;
 945     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 946     if (CompareVT != VT) {
 947       // Bitcast True / False to the correct types.  This will end up being
 948       // a nop, but it allows us to define only a single pattern in the
 949       // .TD files for each CND* instruction rather than having to have
 950       // one pattern for integer True/False and one for fp True/False
 951       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 952       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 953     }
 954
 955     switch (CCOpcode) {
 956     case ISD::SETONE:
 957     case ISD::SETUNE:
 958     case ISD::SETNE:
 959       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 960       Temp = True;
 961       True = False;
 962       False = Temp;
 963       break;
 964     default:
 965       break;
 966     }
 967     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 968         Cond, Zero,
 969         True, False,
 970         DAG.getCondCode(CCOpcode));
 971     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 972   }
 973
 974
 975   // Possible Min/Max pattern
 976   SDValue MinMax = LowerMinMax(Op, DAG);
 977   if (MinMax.getNode()) {
 978     return MinMax;
 979   }
 980
 981   // If we make it this for it means we have no native instructions to handle
 982   // this SELECT_CC, so we must lower it.
 983   SDValue HWTrue, HWFalse;
 984
 985   if (CompareVT == MVT::f32) {
 986     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 987     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 988   } else if (CompareVT == MVT::i32) {
 989     HWTrue = DAG.getConstant(-1, CompareVT);
 990     HWFalse = DAG.getConstant(0, CompareVT);
 991   }
 992   else {
 993     assert(!"Unhandled value type in LowerSELECT_CC");
 994   }
 995
 996   // Lower this unsupported SELECT_CC into a combination of two supported
 997   // SELECT_CC operations.
 998   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 999
1000   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1001       Cond, HWFalse,
1002       True, False,
1003       DAG.getCondCode(ISD::SETNE));
1004 }
1005
1006 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
1007 /// convert these pointers to a register index.  Each register holds
1008 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1009 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1010 /// for indirect addressing.
1011 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1012                                                unsigned StackWidth,
1013                                                SelectionDAG &DAG) const {
1014   unsigned SRLPad;
1015   switch(StackWidth) {
1016   case 1:
1017     SRLPad = 2;
1018     break;
1019   case 2:
1020     SRLPad = 3;
1021     break;
1022   case 4:
1023     SRLPad = 4;
1024     break;
1025   default: llvm_unreachable("Invalid stack width");
1026   }
1027
1028   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1029                      DAG.getConstant(SRLPad, MVT::i32));
1030 }
1031
1032 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1033                                          unsigned ElemIdx,
1034                                          unsigned &Channel,
1035                                          unsigned &PtrIncr) const {
1036   switch (StackWidth) {
1037   default:
1038   case 1:
1039     Channel = 0;
1040     if (ElemIdx > 0) {
1041       PtrIncr = 1;
1042     } else {
1043       PtrIncr = 0;
1044     }
1045     break;
1046   case 2:
1047     Channel = ElemIdx % 2;
1048     if (ElemIdx == 2) {
1049       PtrIncr = 1;
1050     } else {
1051       PtrIncr = 0;
1052     }
1053     break;
1054   case 4:
1055     Channel = ElemIdx;
1056     PtrIncr = 0;
1057     break;
1058   }
1059 }
1060
1061 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1062   SDLoc DL(Op);
1063   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1064   SDValue Chain = Op.getOperand(0);
1065   SDValue Value = Op.getOperand(1);
1066   SDValue Ptr = Op.getOperand(2);
1067
1068   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1069   if (Result.getNode()) {
1070     return Result;
1071   }
1072
1073   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1074     if (StoreNode->isTruncatingStore()) {
1075       EVT VT = Value.getValueType();
1076       assert(VT.bitsLE(MVT::i32));
1077       EVT MemVT = StoreNode->getMemoryVT();
1078       SDValue MaskConstant;
1079       if (MemVT == MVT::i8) {
1080         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1081       } else {
1082         assert(MemVT == MVT::i16);
1083         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1084       }
1085       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1086                                       DAG.getConstant(2, MVT::i32));
1087       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1088                                       DAG.getConstant(0x00000003, VT));
1089       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1090       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1091                                    DAG.getConstant(3, VT));
1092       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1093       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1094       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1095       // vector instead.
1096       SDValue Src[4] = {
1097         ShiftedValue,
1098         DAG.getConstant(0, MVT::i32),
1099         DAG.getConstant(0, MVT::i32),
1100         Mask
1101       };
1102       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
1103       SDValue Args[3] = { Chain, Input, DWordAddr };
1104       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1105                                      Op->getVTList(), Args, 3, MemVT,
1106                                      StoreNode->getMemOperand());
1107     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1108                Value.getValueType().bitsGE(MVT::i32)) {
1109       // Convert pointer from byte address to dword address.
1110       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1111                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1112                                     Ptr, DAG.getConstant(2, MVT::i32)));
1113
1114       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1115         assert(!"Truncated and indexed stores not supported yet");
1116       } else {
1117         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1118       }
1119       return Chain;
1120     }
1121   }
1122
1123   EVT ValueVT = Value.getValueType();
1124
1125   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1126     return SDValue();
1127   }
1128
1129   // Lowering for indirect addressing
1130
1131   const MachineFunction &MF = DAG.getMachineFunction();
1132   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1133                                          getTargetMachine().getFrameLowering());
1134   unsigned StackWidth = TFL->getStackWidth(MF);
1135
1136   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1137
1138   if (ValueVT.isVector()) {
1139     unsigned NumElemVT = ValueVT.getVectorNumElements();
1140     EVT ElemVT = ValueVT.getVectorElementType();
1141     SDValue Stores[4];
1142
1143     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1144                                       "vector width in load");
1145
1146     for (unsigned i = 0; i < NumElemVT; ++i) {
1147       unsigned Channel, PtrIncr;
1148       getStackAddress(StackWidth, i, Channel, PtrIncr);
1149       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1150                         DAG.getConstant(PtrIncr, MVT::i32));
1151       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1152                                  Value, DAG.getConstant(i, MVT::i32));
1153
1154       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1155                               Chain, Elem, Ptr,
1156                               DAG.getTargetConstant(Channel, MVT::i32));
1157     }
1158      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1159    } else {
1160     if (ValueVT == MVT::i8) {
1161       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1162     }
1163     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1164     DAG.getTargetConstant(0, MVT::i32)); // Channel
1165   }
1166
1167   return Chain;
1168 }
1169
1170 // return (512 + (kc_bank << 12)
1171 static int
1172 ConstantAddressBlock(unsigned AddressSpace) {
1173   switch (AddressSpace) {
1174   case AMDGPUAS::CONSTANT_BUFFER_0:
1175     return 512;
1176   case AMDGPUAS::CONSTANT_BUFFER_1:
1177     return 512 + 4096;
1178   case AMDGPUAS::CONSTANT_BUFFER_2:
1179     return 512 + 4096 * 2;
1180   case AMDGPUAS::CONSTANT_BUFFER_3:
1181     return 512 + 4096 * 3;
1182   case AMDGPUAS::CONSTANT_BUFFER_4:
1183     return 512 + 4096 * 4;
1184   case AMDGPUAS::CONSTANT_BUFFER_5:
1185     return 512 + 4096 * 5;
1186   case AMDGPUAS::CONSTANT_BUFFER_6:
1187     return 512 + 4096 * 6;
1188   case AMDGPUAS::CONSTANT_BUFFER_7:
1189     return 512 + 4096 * 7;
1190   case AMDGPUAS::CONSTANT_BUFFER_8:
1191     return 512 + 4096 * 8;
1192   case AMDGPUAS::CONSTANT_BUFFER_9:
1193     return 512 + 4096 * 9;
1194   case AMDGPUAS::CONSTANT_BUFFER_10:
1195     return 512 + 4096 * 10;
1196   case AMDGPUAS::CONSTANT_BUFFER_11:
1197     return 512 + 4096 * 11;
1198   case AMDGPUAS::CONSTANT_BUFFER_12:
1199     return 512 + 4096 * 12;
1200   case AMDGPUAS::CONSTANT_BUFFER_13:
1201     return 512 + 4096 * 13;
1202   case AMDGPUAS::CONSTANT_BUFFER_14:
1203     return 512 + 4096 * 14;
1204   case AMDGPUAS::CONSTANT_BUFFER_15:
1205     return 512 + 4096 * 15;
1206   default:
1207     return -1;
1208   }
1209 }
1210
1211 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1212 {
1213   EVT VT = Op.getValueType();
1214   SDLoc DL(Op);
1215   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1216   SDValue Chain = Op.getOperand(0);
1217   SDValue Ptr = Op.getOperand(1);
1218   SDValue LoweredLoad;
1219
1220   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1221     SDValue MergedValues[2] = {
1222       SplitVectorLoad(Op, DAG),
1223       Chain
1224     };
1225     return DAG.getMergeValues(MergedValues, 2, DL);
1226   }
1227
1228   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1229   if (ConstantBlock > -1 && LoadNode->getExtensionType() != ISD::SEXTLOAD) {
1230     SDValue Result;
1231     if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
1232         isa<Constant>(LoadNode->getSrcValue()) ||
1233         isa<ConstantSDNode>(Ptr)) {
1234       SDValue Slots[4];
1235       for (unsigned i = 0; i < 4; i++) {
1236         // We want Const position encoded with the following formula :
1237         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1238         // const_index is Ptr computed by llvm using an alignment of 16.
1239         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1240         // then div by 4 at the ISel step
1241         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1242             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1243         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1244       }
1245       EVT NewVT = MVT::v4i32;
1246       unsigned NumElements = 4;
1247       if (VT.isVector()) {
1248         NewVT = VT;
1249         NumElements = VT.getVectorNumElements();
1250       }
1251       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
1252     } else {
1253       // non constant ptr cant be folded, keeps it as a v4f32 load
1254       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1255           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1256           DAG.getConstant(LoadNode->getAddressSpace() -
1257                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1258           );
1259     }
1260
1261     if (!VT.isVector()) {
1262       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1263           DAG.getConstant(0, MVT::i32));
1264     }
1265
1266     SDValue MergedValues[2] = {
1267         Result,
1268         Chain
1269     };
1270     return DAG.getMergeValues(MergedValues, 2, DL);
1271   }
1272
1273   // For most operations returning SDValue() will result in the node being
1274   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1275   // need to manually expand loads that may be legal in some address spaces and
1276   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1277   // compute shaders, since the data is sign extended when it is uploaded to the
1278   // buffer. However SEXT loads from other address spaces are not supported, so
1279   // we need to expand them here.
1280   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1281     EVT MemVT = LoadNode->getMemoryVT();
1282     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1283     SDValue ShiftAmount =
1284           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1285     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1286                                   LoadNode->getPointerInfo(), MemVT,
1287                                   LoadNode->isVolatile(),
1288                                   LoadNode->isNonTemporal(),
1289                                   LoadNode->getAlignment());
1290     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1291     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1292
1293     SDValue MergedValues[2] = { Sra, Chain };
1294     return DAG.getMergeValues(MergedValues, 2, DL);
1295   }
1296
1297   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1298     return SDValue();
1299   }
1300
1301   // Lowering for indirect addressing
1302   const MachineFunction &MF = DAG.getMachineFunction();
1303   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1304                                          getTargetMachine().getFrameLowering());
1305   unsigned StackWidth = TFL->getStackWidth(MF);
1306
1307   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1308
1309   if (VT.isVector()) {
1310     unsigned NumElemVT = VT.getVectorNumElements();
1311     EVT ElemVT = VT.getVectorElementType();
1312     SDValue Loads[4];
1313
1314     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1315                                       "vector width in load");
1316
1317     for (unsigned i = 0; i < NumElemVT; ++i) {
1318       unsigned Channel, PtrIncr;
1319       getStackAddress(StackWidth, i, Channel, PtrIncr);
1320       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1321                         DAG.getConstant(PtrIncr, MVT::i32));
1322       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1323                              Chain, Ptr,
1324                              DAG.getTargetConstant(Channel, MVT::i32),
1325                              Op.getOperand(2));
1326     }
1327     for (unsigned i = NumElemVT; i < 4; ++i) {
1328       Loads[i] = DAG.getUNDEF(ElemVT);
1329     }
1330     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1331     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1332   } else {
1333     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1334                               Chain, Ptr,
1335                               DAG.getTargetConstant(0, MVT::i32), // Channel
1336                               Op.getOperand(2));
1337   }
1338
1339   SDValue Ops[2];
1340   Ops[0] = LoweredLoad;
1341   Ops[1] = Chain;
1342
1343   return DAG.getMergeValues(Ops, 2, DL);
1344 }
1345
1346 /// XXX Only kernel functions are supported, so we can assume for now that
1347 /// every function is a kernel function, but in the future we should use
1348 /// separate calling conventions for kernel and non-kernel functions.
1349 SDValue R600TargetLowering::LowerFormalArguments(
1350                                       SDValue Chain,
1351                                       CallingConv::ID CallConv,
1352                                       bool isVarArg,
1353                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1354                                       SDLoc DL, SelectionDAG &DAG,
1355                                       SmallVectorImpl<SDValue> &InVals) const {
1356   SmallVector<CCValAssign, 16> ArgLocs;
1357   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1358                  getTargetMachine(), ArgLocs, *DAG.getContext());
1359   MachineFunction &MF = DAG.getMachineFunction();
1360   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
1361
1362   SmallVector<ISD::InputArg, 8> LocalIns;
1363
1364   getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
1365                           LocalIns);
1366
1367   AnalyzeFormalArguments(CCInfo, LocalIns);
1368
1369   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1370     CCValAssign &VA = ArgLocs[i];
1371     EVT VT = Ins[i].VT;
1372     EVT MemVT = LocalIns[i].VT;
1373
1374     if (ShaderType != ShaderType::COMPUTE) {
1375       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1376       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1377       InVals.push_back(Register);
1378       continue;
1379     }
1380
1381     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1382                                                    AMDGPUAS::CONSTANT_BUFFER_0);
1383
1384     // The first 36 bytes of the input buffer contains information about
1385     // thread group and global sizes.
1386     SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
1387                                  DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1388                                  MachinePointerInfo(UndefValue::get(PtrTy)),
1389                                  MemVT, false, false, 4);
1390                                  // 4 is the prefered alignment for
1391                                  // the CONSTANT memory space.
1392     InVals.push_back(Arg);
1393   }
1394   return Chain;
1395 }
1396
1397 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1398    if (!VT.isVector()) return MVT::i32;
1399    return VT.changeVectorElementTypeToInteger();
1400 }
1401
1402 static SDValue
1403 CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
1404                         DenseMap<unsigned, unsigned> &RemapSwizzle) {
1405   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1406   assert(RemapSwizzle.empty());
1407   SDValue NewBldVec[4] = {
1408       VectorEntry.getOperand(0),
1409       VectorEntry.getOperand(1),
1410       VectorEntry.getOperand(2),
1411       VectorEntry.getOperand(3)
1412   };
1413
1414   for (unsigned i = 0; i < 4; i++) {
1415     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1416       // We mask write here to teach later passes that the ith element of this
1417       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1418       // break false dependencies and additionnaly make assembly easier to read.
1419       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1420     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1421       if (C->isZero()) {
1422         RemapSwizzle[i] = 4; // SEL_0
1423         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1424       } else if (C->isExactlyValue(1.0)) {
1425         RemapSwizzle[i] = 5; // SEL_1
1426         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1427       }
1428     }
1429
1430     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1431       continue;
1432     for (unsigned j = 0; j < i; j++) {
1433       if (NewBldVec[i] == NewBldVec[j]) {
1434         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1435         RemapSwizzle[i] = j;
1436         break;
1437       }
1438     }
1439   }
1440
1441   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1442       VectorEntry.getValueType(), NewBldVec, 4);
1443 }
1444
1445 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1446                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1447   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1448   assert(RemapSwizzle.empty());
1449   SDValue NewBldVec[4] = {
1450       VectorEntry.getOperand(0),
1451       VectorEntry.getOperand(1),
1452       VectorEntry.getOperand(2),
1453       VectorEntry.getOperand(3)
1454   };
1455   bool isUnmovable[4] = { false, false, false, false };
1456   for (unsigned i = 0; i < 4; i++)
1457     RemapSwizzle[i] = i;
1458
1459   for (unsigned i = 0; i < 4; i++) {
1460     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1461       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1462           ->getZExtValue();
1463       if (i == Idx) {
1464         isUnmovable[Idx] = true;
1465         continue;
1466       }
1467       if (isUnmovable[Idx])
1468         continue;
1469       // Swap i and Idx
1470       std::swap(NewBldVec[Idx], NewBldVec[i]);
1471       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1472       break;
1473     }
1474   }
1475
1476   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1477       VectorEntry.getValueType(), NewBldVec, 4);
1478 }
1479
1480
1481 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1482 SDValue Swz[4], SelectionDAG &DAG) const {
1483   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1484   // Old -> New swizzle values
1485   DenseMap<unsigned, unsigned> SwizzleRemap;
1486
1487   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1488   for (unsigned i = 0; i < 4; i++) {
1489     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1490     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1491       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1492   }
1493
1494   SwizzleRemap.clear();
1495   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1496   for (unsigned i = 0; i < 4; i++) {
1497     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1498     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1499       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1500   }
1501
1502   return BuildVector;
1503 }
1504
1505
1506 //===----------------------------------------------------------------------===//
1507 // Custom DAG Optimizations
1508 //===----------------------------------------------------------------------===//
1509
1510 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1511                                               DAGCombinerInfo &DCI) const {
1512   SelectionDAG &DAG = DCI.DAG;
1513
1514   switch (N->getOpcode()) {
1515   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1516   case ISD::FP_ROUND: {
1517       SDValue Arg = N->getOperand(0);
1518       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1519         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1520                            Arg.getOperand(0));
1521       }
1522       break;
1523     }
1524
1525   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1526   // (i32 select_cc f32, f32, -1, 0 cc)
1527   //
1528   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1529   // this to one of the SET*_DX10 instructions.
1530   case ISD::FP_TO_SINT: {
1531     SDValue FNeg = N->getOperand(0);
1532     if (FNeg.getOpcode() != ISD::FNEG) {
1533       return SDValue();
1534     }
1535     SDValue SelectCC = FNeg.getOperand(0);
1536     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1537         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1538         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1539         !isHWTrueValue(SelectCC.getOperand(2)) ||
1540         !isHWFalseValue(SelectCC.getOperand(3))) {
1541       return SDValue();
1542     }
1543
1544     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1545                            SelectCC.getOperand(0), // LHS
1546                            SelectCC.getOperand(1), // RHS
1547                            DAG.getConstant(-1, MVT::i32), // True
1548                            DAG.getConstant(0, MVT::i32),  // Flase
1549                            SelectCC.getOperand(4)); // CC
1550
1551     break;
1552   }
1553
1554   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1555   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1556   case ISD::INSERT_VECTOR_ELT: {
1557     SDValue InVec = N->getOperand(0);
1558     SDValue InVal = N->getOperand(1);
1559     SDValue EltNo = N->getOperand(2);
1560     SDLoc dl(N);
1561
1562     // If the inserted element is an UNDEF, just use the input vector.
1563     if (InVal.getOpcode() == ISD::UNDEF)
1564       return InVec;
1565
1566     EVT VT = InVec.getValueType();
1567
1568     // If we can't generate a legal BUILD_VECTOR, exit
1569     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1570       return SDValue();
1571
1572     // Check that we know which element is being inserted
1573     if (!isa<ConstantSDNode>(EltNo))
1574       return SDValue();
1575     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1576
1577     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1578     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1579     // vector elements.
1580     SmallVector<SDValue, 8> Ops;
1581     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1582       Ops.append(InVec.getNode()->op_begin(),
1583                  InVec.getNode()->op_end());
1584     } else if (InVec.getOpcode() == ISD::UNDEF) {
1585       unsigned NElts = VT.getVectorNumElements();
1586       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1587     } else {
1588       return SDValue();
1589     }
1590
1591     // Insert the element
1592     if (Elt < Ops.size()) {
1593       // All the operands of BUILD_VECTOR must have the same type;
1594       // we enforce that here.
1595       EVT OpVT = Ops[0].getValueType();
1596       if (InVal.getValueType() != OpVT)
1597         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1598           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1599           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1600       Ops[Elt] = InVal;
1601     }
1602
1603     // Return the new vector
1604     return DAG.getNode(ISD::BUILD_VECTOR, dl,
1605                        VT, &Ops[0], Ops.size());
1606   }
1607
1608   // Extract_vec (Build_vector) generated by custom lowering
1609   // also needs to be customly combined
1610   case ISD::EXTRACT_VECTOR_ELT: {
1611     SDValue Arg = N->getOperand(0);
1612     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1613       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1614         unsigned Element = Const->getZExtValue();
1615         return Arg->getOperand(Element);
1616       }
1617     }
1618     if (Arg.getOpcode() == ISD::BITCAST &&
1619         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1620       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1621         unsigned Element = Const->getZExtValue();
1622         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1623             Arg->getOperand(0).getOperand(Element));
1624       }
1625     }
1626   }
1627
1628   case ISD::SELECT_CC: {
1629     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1630     //      selectcc x, y, a, b, inv(cc)
1631     //
1632     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1633     //      selectcc x, y, a, b, cc
1634     SDValue LHS = N->getOperand(0);
1635     if (LHS.getOpcode() != ISD::SELECT_CC) {
1636       return SDValue();
1637     }
1638
1639     SDValue RHS = N->getOperand(1);
1640     SDValue True = N->getOperand(2);
1641     SDValue False = N->getOperand(3);
1642     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1643
1644     if (LHS.getOperand(2).getNode() != True.getNode() ||
1645         LHS.getOperand(3).getNode() != False.getNode() ||
1646         RHS.getNode() != False.getNode()) {
1647       return SDValue();
1648     }
1649
1650     switch (NCC) {
1651     default: return SDValue();
1652     case ISD::SETNE: return LHS;
1653     case ISD::SETEQ: {
1654       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1655       LHSCC = ISD::getSetCCInverse(LHSCC,
1656                                   LHS.getOperand(0).getValueType().isInteger());
1657       if (DCI.isBeforeLegalizeOps() ||
1658           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1659         return DAG.getSelectCC(SDLoc(N),
1660                                LHS.getOperand(0),
1661                                LHS.getOperand(1),
1662                                LHS.getOperand(2),
1663                                LHS.getOperand(3),
1664                                LHSCC);
1665       break;
1666     }
1667     }
1668     return SDValue();
1669   }
1670
1671   case AMDGPUISD::EXPORT: {
1672     SDValue Arg = N->getOperand(1);
1673     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1674       break;
1675
1676     SDValue NewArgs[8] = {
1677       N->getOperand(0), // Chain
1678       SDValue(),
1679       N->getOperand(2), // ArrayBase
1680       N->getOperand(3), // Type
1681       N->getOperand(4), // SWZ_X
1682       N->getOperand(5), // SWZ_Y
1683       N->getOperand(6), // SWZ_Z
1684       N->getOperand(7) // SWZ_W
1685     };
1686     SDLoc DL(N);
1687     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1688     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1689   }
1690   case AMDGPUISD::TEXTURE_FETCH: {
1691     SDValue Arg = N->getOperand(1);
1692     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1693       break;
1694
1695     SDValue NewArgs[19] = {
1696       N->getOperand(0),
1697       N->getOperand(1),
1698       N->getOperand(2),
1699       N->getOperand(3),
1700       N->getOperand(4),
1701       N->getOperand(5),
1702       N->getOperand(6),
1703       N->getOperand(7),
1704       N->getOperand(8),
1705       N->getOperand(9),
1706       N->getOperand(10),
1707       N->getOperand(11),
1708       N->getOperand(12),
1709       N->getOperand(13),
1710       N->getOperand(14),
1711       N->getOperand(15),
1712       N->getOperand(16),
1713       N->getOperand(17),
1714       N->getOperand(18),
1715     };
1716     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1717     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1718         NewArgs, 19);
1719   }
1720   }
1721   return SDValue();
1722 }
1723
1724 static bool
1725 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
1726             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
1727   const R600InstrInfo *TII =
1728       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1729   if (!Src.isMachineOpcode())
1730     return false;
1731   switch (Src.getMachineOpcode()) {
1732   case AMDGPU::FNEG_R600:
1733     if (!Neg.getNode())
1734       return false;
1735     Src = Src.getOperand(0);
1736     Neg = DAG.getTargetConstant(1, MVT::i32);
1737     return true;
1738   case AMDGPU::FABS_R600:
1739     if (!Abs.getNode())
1740       return false;
1741     Src = Src.getOperand(0);
1742     Abs = DAG.getTargetConstant(1, MVT::i32);
1743     return true;
1744   case AMDGPU::CONST_COPY: {
1745     unsigned Opcode = ParentNode->getMachineOpcode();
1746     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1747
1748     if (!Sel.getNode())
1749       return false;
1750
1751     SDValue CstOffset = Src.getOperand(0);
1752     if (ParentNode->getValueType(0).isVector())
1753       return false;
1754
1755     // Gather constants values
1756     int SrcIndices[] = {
1757       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1758       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1759       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
1760       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1761       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1762       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1763       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1764       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1765       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1766       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1767       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1768     };
1769     std::vector<unsigned> Consts;
1770     for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
1771       int OtherSrcIdx = SrcIndices[i];
1772       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1773       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
1774         continue;
1775       if (HasDst) {
1776         OtherSrcIdx--;
1777         OtherSelIdx--;
1778       }
1779       if (RegisterSDNode *Reg =
1780           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
1781         if (Reg->getReg() == AMDGPU::ALU_CONST) {
1782           ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
1783               ParentNode->getOperand(OtherSelIdx));
1784           Consts.push_back(Cst->getZExtValue());
1785         }
1786       }
1787     }
1788
1789     ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
1790     Consts.push_back(Cst->getZExtValue());
1791     if (!TII->fitsConstReadLimitations(Consts)) {
1792       return false;
1793     }
1794
1795     Sel = CstOffset;
1796     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
1797     return true;
1798   }
1799   case AMDGPU::MOV_IMM_I32:
1800   case AMDGPU::MOV_IMM_F32: {
1801     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
1802     uint64_t ImmValue = 0;
1803
1804
1805     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
1806       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
1807       float FloatValue = FPC->getValueAPF().convertToFloat();
1808       if (FloatValue == 0.0) {
1809         ImmReg = AMDGPU::ZERO;
1810       } else if (FloatValue == 0.5) {
1811         ImmReg = AMDGPU::HALF;
1812       } else if (FloatValue == 1.0) {
1813         ImmReg = AMDGPU::ONE;
1814       } else {
1815         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
1816       }
1817     } else {
1818       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
1819       uint64_t Value = C->getZExtValue();
1820       if (Value == 0) {
1821         ImmReg = AMDGPU::ZERO;
1822       } else if (Value == 1) {
1823         ImmReg = AMDGPU::ONE_INT;
1824       } else {
1825         ImmValue = Value;
1826       }
1827     }
1828
1829     // Check that we aren't already using an immediate.
1830     // XXX: It's possible for an instruction to have more than one
1831     // immediate operand, but this is not supported yet.
1832     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
1833       if (!Imm.getNode())
1834         return false;
1835       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
1836       assert(C);
1837       if (C->getZExtValue())
1838         return false;
1839       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
1840     }
1841     Src = DAG.getRegister(ImmReg, MVT::i32);
1842     return true;
1843   }
1844   default:
1845     return false;
1846   }
1847 }
1848
1849
1850 /// \brief Fold the instructions after selecting them
1851 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
1852                                             SelectionDAG &DAG) const {
1853   const R600InstrInfo *TII =
1854       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1855   if (!Node->isMachineOpcode())
1856     return Node;
1857   unsigned Opcode = Node->getMachineOpcode();
1858   SDValue FakeOp;
1859
1860   std::vector<SDValue> Ops;
1861   for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
1862               I != E; ++I)
1863           Ops.push_back(*I);
1864
1865   if (Opcode == AMDGPU::DOT_4) {
1866     int OperandIdx[] = {
1867       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1868       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1869       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1870       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1871       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1872       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1873       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1874       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1875         };
1876     int NegIdx[] = {
1877       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
1878       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
1879       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
1880       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
1881       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
1882       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
1883       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
1884       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
1885     };
1886     int AbsIdx[] = {
1887       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
1888       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
1889       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
1890       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
1891       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
1892       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
1893       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
1894       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
1895     };
1896     for (unsigned i = 0; i < 8; i++) {
1897       if (OperandIdx[i] < 0)
1898         return Node;
1899       SDValue &Src = Ops[OperandIdx[i] - 1];
1900       SDValue &Neg = Ops[NegIdx[i] - 1];
1901       SDValue &Abs = Ops[AbsIdx[i] - 1];
1902       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1903       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1904       if (HasDst)
1905         SelIdx--;
1906       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1907       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
1908         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1909     }
1910   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
1911     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
1912       SDValue &Src = Ops[i];
1913       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
1914         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1915     }
1916   } else if (Opcode == AMDGPU::CLAMP_R600) {
1917     SDValue Src = Node->getOperand(0);
1918     if (!Src.isMachineOpcode() ||
1919         !TII->hasInstrModifiers(Src.getMachineOpcode()))
1920       return Node;
1921     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
1922         AMDGPU::OpName::clamp);
1923     if (ClampIdx < 0)
1924       return Node;
1925     std::vector<SDValue> Ops;
1926     unsigned NumOp = Src.getNumOperands();
1927     for(unsigned i = 0; i < NumOp; ++i)
1928           Ops.push_back(Src.getOperand(i));
1929     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
1930     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
1931         Node->getVTList(), Ops);
1932   } else {
1933     if (!TII->hasInstrModifiers(Opcode))
1934       return Node;
1935     int OperandIdx[] = {
1936       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1937       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1938       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
1939     };
1940     int NegIdx[] = {
1941       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
1942       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
1943       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
1944     };
1945     int AbsIdx[] = {
1946       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
1947       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
1948       -1
1949     };
1950     for (unsigned i = 0; i < 3; i++) {
1951       if (OperandIdx[i] < 0)
1952         return Node;
1953       SDValue &Src = Ops[OperandIdx[i] - 1];
1954       SDValue &Neg = Ops[NegIdx[i] - 1];
1955       SDValue FakeAbs;
1956       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
1957       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1958       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1959       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
1960       if (HasDst) {
1961         SelIdx--;
1962         ImmIdx--;
1963       }
1964       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1965       SDValue &Imm = Ops[ImmIdx];
1966       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
1967         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1968     }
1969   }
1970
1971   return Node;
1972 }