Track IR ordering of SelectionDAG nodes 2/4.

[oota-llvm.git] / lib / Target / R600 / R600ISelLowering.cpp
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp

index 85187f8fc516597a55f7b76385ca6fb1e28284c4..de6c5046a72498fcb514dece28f9f98a43f59363 100644 (file)
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -28,7 +28,6 @@ using namespace llvm;
  R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
      AMDGPUTargetLowering(TM),
      TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
-  setOperationAction(ISD::MUL, MVT::i64, Expand);
    addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
    addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
    addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
@@ -44,34 +43,48 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::AND,  MVT::v4i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
+  setOperationAction(ISD::MUL,  MVT::v2i32, Expand);
+  setOperationAction(ISD::MUL,  MVT::v4i32, Expand);
+  setOperationAction(ISD::OR, MVT::v4i32, Expand);
+  setOperationAction(ISD::OR, MVT::v2i32, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
+  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
+  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
+  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
+  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
+  setOperationAction(ISD::SRA, MVT::v4i32, Expand);
+  setOperationAction(ISD::SRA, MVT::v2i32, Expand);
+  setOperationAction(ISD::SUB, MVT::v4i32, Expand);
+  setOperationAction(ISD::SUB, MVT::v2i32, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
    setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
    setOperationAction(ISD::UREM, MVT::v4i32, Expand);
    setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
+  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  
-  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
-  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  
    setOperationAction(ISD::FSUB, MVT::f32, Expand);
  
    setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
-  setOperationAction(ISD::FPOW, MVT::f32, Custom);
-
-  setOperationAction(ISD::ROTL, MVT::i32, Custom);
  
    setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  
-  setOperationAction(ISD::SETCC, MVT::i32, Custom);
-  setOperationAction(ISD::SETCC, MVT::f32, Custom);
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  
    setOperationAction(ISD::SELECT, MVT::i32, Custom);
    setOperationAction(ISD::SELECT, MVT::f32, Custom);
  
+  setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
+  setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
+
    // Legalize loads and stores to the private address space.
    setOperationAction(ISD::LOAD, MVT::i32, Custom);
    setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -90,8 +103,12 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
  
    setTargetDAGCombine(ISD::FP_ROUND);
+  setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::SELECT_CC);
  
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
    setSchedulingPreference(Sched::VLIW);
  }
  
@@ -103,7 +120,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
  
    switch (MI->getOpcode()) {
    default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
-  case AMDGPU::SHADER_TYPE: break;
    case AMDGPU::CLAMP_R600: {
      MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                     AMDGPU::MOV,
@@ -148,7 +164,13 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
      TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                       MI->getOperand(1).getImm());
      break;
-
+  case AMDGPU::CONST_COPY: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
+        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
+    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
+        MI->getOperand(1).getImm());
+    break;
+  }
  
    case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
    case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
@@ -164,23 +186,99 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    case AMDGPU::TXD: {
      unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
      unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
-
+    MachineOperand &RID = MI->getOperand(4);
+    MachineOperand &SID = MI->getOperand(5);
+    unsigned TextureId = MI->getOperand(6).getImm();
+    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
+    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
+
+    switch (TextureId) {
+    case 5: // Rect
+      CTX = CTY = 0;
+      break;
+    case 6: // Shadow1D
+      SrcW = SrcZ;
+      break;
+    case 7: // Shadow2D
+      SrcW = SrcZ;
+      break;
+    case 8: // ShadowRect
+      CTX = CTY = 0;
+      SrcW = SrcZ;
+      break;
+    case 9: // 1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 10: // 2DArray
+      CTZ = 0;
+      break;
+    case 11: // Shadow1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 12: // Shadow2DArray
+      CTZ = 0;
+      break;
+    }
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
              .addOperand(MI->getOperand(3))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6));
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
              .addOperand(MI->getOperand(2))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6));
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
              .addOperand(MI->getOperand(0))
              .addOperand(MI->getOperand(1))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6))
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW)
              .addReg(T0, RegState::Implicit)
              .addReg(T1, RegState::Implicit);
      break;
@@ -189,23 +287,100 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    case AMDGPU::TXD_SHADOW: {
      unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
      unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    MachineOperand &RID = MI->getOperand(4);
+    MachineOperand &SID = MI->getOperand(5);
+    unsigned TextureId = MI->getOperand(6).getImm();
+    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
+    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
+
+    switch (TextureId) {
+    case 5: // Rect
+      CTX = CTY = 0;
+      break;
+    case 6: // Shadow1D
+      SrcW = SrcZ;
+      break;
+    case 7: // Shadow2D
+      SrcW = SrcZ;
+      break;
+    case 8: // ShadowRect
+      CTX = CTY = 0;
+      SrcW = SrcZ;
+      break;
+    case 9: // 1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 10: // 2DArray
+      CTZ = 0;
+      break;
+    case 11: // Shadow1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 12: // Shadow2DArray
+      CTZ = 0;
+      break;
+    }
  
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
              .addOperand(MI->getOperand(3))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6));
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
              .addOperand(MI->getOperand(2))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6));
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
              .addOperand(MI->getOperand(0))
              .addOperand(MI->getOperand(1))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6))
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW)
              .addReg(T0, RegState::Implicit)
              .addReg(T1, RegState::Implicit);
      break;
@@ -213,8 +388,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
  
    case AMDGPU::BRANCH:
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
-              .addOperand(MI->getOperand(0))
-              .addReg(0);
+              .addOperand(MI->getOperand(0));
        break;
  
    case AMDGPU::BRANCH_COND_f32: {
@@ -225,7 +399,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
                .addImm(OPCODE_IS_NOT_ZERO)
                .addImm(0); // Flags
      TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
-    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
              .addOperand(MI->getOperand(0))
              .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
      break;
@@ -239,7 +413,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
      TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
-    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
             .addOperand(MI->getOperand(0))
              .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
      break;
@@ -298,71 +472,13 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
  // Custom DAG Lowering Operations
  //===----------------------------------------------------------------------===//
  
-using namespace llvm::Intrinsic;
-using namespace llvm::AMDGPUIntrinsic;
-
-static SDValue
-InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
-    unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
-    SDValue Scalar, SDValue Chain) {
-  if (!ExportMap[Slot]) {
-    SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
-      DL, MVT::v4f32,
-      DAG.getUNDEF(MVT::v4f32),
-      Scalar,
-      DAG.getConstant(Channel, MVT::i32));
-
-    unsigned Mask = 1 << Channel;
-
-    const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
-        DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
-        DAG.getConstant(Mask, MVT::i32)};
-
-    SDValue Res =  DAG.getNode(
-        AMDGPUISD::EXPORT,
-        DL,
-        MVT::Other,
-        Ops, 6);
-     ExportMap[Slot] = Res.getNode();
-     return Res;
-  }
-
-  SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ;
-  SDValue PreviousVector = ExportInstruction->getOperand(1);
-  SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
-      DL, MVT::v4f32,
-      PreviousVector,
-      Scalar,
-      DAG.getConstant(Channel, MVT::i32));
-
-  unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5))
-      ->getZExtValue();
-  Mask |= (1 << Channel);
-
-  const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
-      DAG.getConstant(Inst, MVT::i32),
-      DAG.getConstant(Type, MVT::i32),
-      DAG.getConstant(Slot, MVT::i32),
-      DAG.getConstant(Mask, MVT::i32)};
-
-  DAG.UpdateNodeOperands(ExportInstruction,
-      Ops, 6);
-
-  return Chain;
-
-}
-
  SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
-  case ISD::ROTL: return LowerROTL(Op, DAG);
    case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
    case ISD::SELECT: return LowerSELECT(Op, DAG);
-  case ISD::SETCC: return LowerSETCC(Op, DAG);
    case ISD::STORE: return LowerSTORE(Op, DAG);
    case ISD::LOAD: return LowerLOAD(Op, DAG);
-  case ISD::FPOW: return LowerFPOW(Op, DAG);
    case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
    case ISD::INTRINSIC_VOID: {
      SDValue Chain = Op.getOperand(0);
@@ -375,18 +491,21 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
        int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
        unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
        MFI->LiveOuts.push_back(Reg);
-      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
+      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
      }
-    case AMDGPUIntrinsic::R600_store_pixel_color: {
-      MachineFunction &MF = DAG.getMachineFunction();
-      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-
-      SDNode **OutputsMap = MFI->Outputs;
-      return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
-          RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
-          Chain);
-
+    case AMDGPUIntrinsic::R600_store_swizzle: {
+      const SDValue Args[8] = {
+        Chain,
+        Op.getOperand(2), // Export Value
+        Op.getOperand(3), // ArrayBase
+        Op.getOperand(4), // Type
+        DAG.getConstant(0, MVT::i32), // SWZ_X
+        DAG.getConstant(1, MVT::i32), // SWZ_Y
+        DAG.getConstant(2, MVT::i32), // SWZ_Z
+        DAG.getConstant(3, MVT::i32) // SWZ_W
+      };
+      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
+          Args, 8);
      }
  
      // default for switch(IntrinsicID)
@@ -399,13 +518,17 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
      unsigned IntrinsicID =
                           cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
      EVT VT = Op.getValueType();
-    DebugLoc DL = Op.getDebugLoc();
+    SDLoc DL(Op);
      switch(IntrinsicID) {
      default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
      case AMDGPUIntrinsic::R600_load_input: {
        int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
        unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
-      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      MRI.addLiveIn(Reg);
+      return DAG.getCopyFromReg(DAG.getEntryNode(),
+          SDLoc(DAG.getEntryNode()), Reg, VT);
      }
  
      case AMDGPUIntrinsic::R600_interp_input: {
@@ -437,42 +560,132 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
  
        return SDValue(interp, slot % 2);
      }
+    case AMDGPUIntrinsic::R600_tex:
+    case AMDGPUIntrinsic::R600_texc:
+    case AMDGPUIntrinsic::R600_txl:
+    case AMDGPUIntrinsic::R600_txlc:
+    case AMDGPUIntrinsic::R600_txb:
+    case AMDGPUIntrinsic::R600_txbc:
+    case AMDGPUIntrinsic::R600_txf:
+    case AMDGPUIntrinsic::R600_txq:
+    case AMDGPUIntrinsic::R600_ddx:
+    case AMDGPUIntrinsic::R600_ddy: {
+      unsigned TextureOp;
+      switch (IntrinsicID) {
+      case AMDGPUIntrinsic::R600_tex:
+        TextureOp = 0;
+        break;
+      case AMDGPUIntrinsic::R600_texc:
+        TextureOp = 1;
+        break;
+      case AMDGPUIntrinsic::R600_txl:
+        TextureOp = 2;
+        break;
+      case AMDGPUIntrinsic::R600_txlc:
+        TextureOp = 3;
+        break;
+      case AMDGPUIntrinsic::R600_txb:
+        TextureOp = 4;
+        break;
+      case AMDGPUIntrinsic::R600_txbc:
+        TextureOp = 5;
+        break;
+      case AMDGPUIntrinsic::R600_txf:
+        TextureOp = 6;
+        break;
+      case AMDGPUIntrinsic::R600_txq:
+        TextureOp = 7;
+        break;
+      case AMDGPUIntrinsic::R600_ddx:
+        TextureOp = 8;
+        break;
+      case AMDGPUIntrinsic::R600_ddy:
+        TextureOp = 9;
+        break;
+      default:
+        llvm_unreachable("Unknow Texture Operation");
+      }
+
+      SDValue TexArgs[19] = {
+        DAG.getConstant(TextureOp, MVT::i32),
+        Op.getOperand(1),
+        DAG.getConstant(0, MVT::i32),
+        DAG.getConstant(1, MVT::i32),
+        DAG.getConstant(2, MVT::i32),
+        DAG.getConstant(3, MVT::i32),
+        Op.getOperand(2),
+        Op.getOperand(3),
+        Op.getOperand(4),
+        DAG.getConstant(0, MVT::i32),
+        DAG.getConstant(1, MVT::i32),
+        DAG.getConstant(2, MVT::i32),
+        DAG.getConstant(3, MVT::i32),
+        Op.getOperand(5),
+        Op.getOperand(6),
+        Op.getOperand(7),
+        Op.getOperand(8),
+        Op.getOperand(9),
+        Op.getOperand(10)
+      };
+      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
+    }
+    case AMDGPUIntrinsic::AMDGPU_dp4: {
+      SDValue Args[8] = {
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(0, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(0, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(1, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(1, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(2, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(2, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(3, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(3, MVT::i32))
+      };
+      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
+    }
  
-    case r600_read_ngroups_x:
+    case Intrinsic::r600_read_ngroups_x:
        return LowerImplicitParameter(DAG, VT, DL, 0);
-    case r600_read_ngroups_y:
+    case Intrinsic::r600_read_ngroups_y:
        return LowerImplicitParameter(DAG, VT, DL, 1);
-    case r600_read_ngroups_z:
+    case Intrinsic::r600_read_ngroups_z:
        return LowerImplicitParameter(DAG, VT, DL, 2);
-    case r600_read_global_size_x:
+    case Intrinsic::r600_read_global_size_x:
        return LowerImplicitParameter(DAG, VT, DL, 3);
-    case r600_read_global_size_y:
+    case Intrinsic::r600_read_global_size_y:
        return LowerImplicitParameter(DAG, VT, DL, 4);
-    case r600_read_global_size_z:
+    case Intrinsic::r600_read_global_size_z:
        return LowerImplicitParameter(DAG, VT, DL, 5);
-    case r600_read_local_size_x:
+    case Intrinsic::r600_read_local_size_x:
        return LowerImplicitParameter(DAG, VT, DL, 6);
-    case r600_read_local_size_y:
+    case Intrinsic::r600_read_local_size_y:
        return LowerImplicitParameter(DAG, VT, DL, 7);
-    case r600_read_local_size_z:
+    case Intrinsic::r600_read_local_size_z:
        return LowerImplicitParameter(DAG, VT, DL, 8);
  
-    case r600_read_tgid_x:
+    case Intrinsic::r600_read_tgid_x:
        return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                    AMDGPU::T1_X, VT);
-    case r600_read_tgid_y:
+    case Intrinsic::r600_read_tgid_y:
        return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                    AMDGPU::T1_Y, VT);
-    case r600_read_tgid_z:
+    case Intrinsic::r600_read_tgid_z:
        return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                    AMDGPU::T1_Z, VT);
-    case r600_read_tidig_x:
+    case Intrinsic::r600_read_tidig_x:
        return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                    AMDGPU::T0_X, VT);
-    case r600_read_tidig_y:
+    case Intrinsic::r600_read_tidig_y:
        return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                    AMDGPU::T0_Y, VT);
-    case r600_read_tidig_z:
+    case Intrinsic::r600_read_tidig_z:
        return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                    AMDGPU::T0_Z, VT);
      }
@@ -509,53 +722,15 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
  SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
    return DAG.getNode(
        ISD::SETCC,
-      Op.getDebugLoc(),
+      SDLoc(Op),
        MVT::i1,
        Op, DAG.getConstantFP(0.0f, MVT::f32),
        DAG.getCondCode(ISD::SETNE)
        );
  }
  
-SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Chain = Op.getOperand(0);
-  SDValue CC = Op.getOperand(1);
-  SDValue LHS   = Op.getOperand(2);
-  SDValue RHS   = Op.getOperand(3);
-  SDValue JumpT  = Op.getOperand(4);
-  SDValue CmpValue;
-  SDValue Result;
-
-  if (LHS.getValueType() == MVT::i32) {
-    CmpValue = DAG.getNode(
-        ISD::SELECT_CC,
-        Op.getDebugLoc(),
-        MVT::i32,
-        LHS, RHS,
-        DAG.getConstant(-1, MVT::i32),
-        DAG.getConstant(0, MVT::i32),
-        CC);
-  } else if (LHS.getValueType() == MVT::f32) {
-    CmpValue = DAG.getNode(
-        ISD::SELECT_CC,
-        Op.getDebugLoc(),
-        MVT::f32,
-        LHS, RHS,
-        DAG.getConstantFP(1.0f, MVT::f32),
-        DAG.getConstantFP(0.0f, MVT::f32),
-        CC);
-  } else {
-    assert(0 && "Not valid type for br_cc");
-  }
-  Result = DAG.getNode(
-      AMDGPUISD::BRANCH_COND,
-      CmpValue.getDebugLoc(),
-      MVT::Other, Chain,
-      JumpT, CmpValue);
-  return Result;
-}
-
  SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
-                                                   DebugLoc DL,
+                                                   SDLoc DL,
                                                     unsigned DwordOffset) const {
    unsigned ByteOffset = DwordOffset * 4;
    PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
@@ -584,18 +759,6 @@ SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const
    return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
  }
  
-SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT VT = Op.getValueType();
-
-  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
-                     Op.getOperand(0),
-                     Op.getOperand(0),
-                     DAG.getNode(ISD::SUB, DL, VT,
-                                 DAG.getConstant(32, MVT::i32),
-                                 Op.getOperand(1)));
-}
-
  bool R600TargetLowering::isZero(SDValue Op) const {
    if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
      return Cst->isNullValue();
@@ -607,7 +770,7 @@ bool R600TargetLowering::isZero(SDValue Op) const {
  }
  
  SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
    EVT VT = Op.getValueType();
  
    SDValue LHS = Op.getOperand(0);
@@ -622,12 +785,37 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
  
    // Check if we can lower this to a native operation.
  
+  // Try to lower to a SET* instruction:
+  //
+  // SET* can match the following patterns:
+  //
+  // select_cc f32, f32, -1,  0, cc_any
+  // select_cc f32, f32, 1.0f, 0.0f, cc_any
+  // select_cc i32, i32, -1,  0, cc_any
+  //
+
+  // Move hardware True/False values to the correct operand.
+  if (isHWTrueValue(False) && isHWFalseValue(True)) {
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    std::swap(False, True);
+    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
+  }
+
+  if (isHWTrueValue(True) && isHWFalseValue(False) &&
+      (CompareVT == VT || VT == MVT::i32)) {
+    // This can be matched by a SET* instruction.
+    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+  }
+
    // Try to lower to a CND* instruction:
-  // CND* instructions requires RHS to be zero.  Some SELECT_CC nodes that
-  // can be lowered to CND* instructions can also be lowered to SET*
-  // instructions.  CND* instructions are cheaper, because they dont't
-  // require additional instructions to convert their result to the correct
-  // value type, so this check should be first.
+  //
+  // CND* can match the following patterns:
+  //
+  // select_cc f32, 0.0, f32, f32, cc_any
+  // select_cc f32, 0.0, i32, i32, cc_any
+  // select_cc i32, 0,   f32, f32, cc_any
+  // select_cc i32, 0,   i32, i32, cc_any
+  //
    if (isZero(LHS) || isZero(RHS)) {
      SDValue Cond = (isZero(LHS) ? RHS : LHS);
      SDValue Zero = (isZero(LHS) ? LHS : RHS);
@@ -669,50 +857,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
      return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
    }
  
-  // Try to lower to a SET* instruction:
-  // We need all the operands of SELECT_CC to have the same value type, so if
-  // necessary we need to change True and False to be the same type as LHS and
-  // RHS, and then convert the result of the select_cc back to the correct type.
-
-  // Move hardware True/False values to the correct operand.
-  if (isHWTrueValue(False) && isHWFalseValue(True)) {
-    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
-    std::swap(False, True);
-    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
-  }
-
-  if (isHWTrueValue(True) && isHWFalseValue(False)) {
-    if (CompareVT !=  VT) {
-      if (VT == MVT::f32 && CompareVT == MVT::i32) {
-        SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
-            LHS, RHS,
-            DAG.getConstant(-1, MVT::i32),
-            DAG.getConstant(0, MVT::i32),
-            CC);
-        // Convert integer values of true (-1) and false (0) to fp values of
-        // true (1.0f) and false (0.0f).
-        SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
-                                                  DAG.getConstant(1, MVT::i32));
-        return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
-      } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
-        SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
-            LHS, RHS,
-            DAG.getConstantFP(1.0f, MVT::f32),
-            DAG.getConstantFP(0.0f, MVT::f32),
-            CC);
-        // Convert fp values of true (1.0f) and false (0.0f) to integer values
-        // of true (-1) and false (0).
-        SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
-        return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
-      } else {
-        // I don't think there will be any other type pairings.
-        assert(!"Unhandled operand type parings in SELECT_CC");
-      }
-    } else {
-      // This SELECT_CC is already legal.
-      return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
-    }
-  }
  
    // Possible Min/Max pattern
    SDValue MinMax = LowerMinMax(Op, DAG);
@@ -747,7 +891,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
  
  SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    return DAG.getNode(ISD::SELECT_CC,
-      Op.getDebugLoc(),
+      SDLoc(Op),
        Op.getValueType(),
        Op.getOperand(0),
        DAG.getConstant(0, MVT::i32),
@@ -756,48 +900,6 @@ SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
        DAG.getCondCode(ISD::SETNE));
  }
  
-SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Cond;
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDValue CC  = Op.getOperand(2);
-  DebugLoc DL = Op.getDebugLoc();
-  assert(Op.getValueType() == MVT::i32);
-  if (LHS.getValueType() == MVT::i32) {
-    Cond = DAG.getNode(
-        ISD::SELECT_CC,
-        Op.getDebugLoc(),
-        MVT::i32,
-        LHS, RHS,
-        DAG.getConstant(-1, MVT::i32),
-        DAG.getConstant(0, MVT::i32),
-        CC);
-  } else if (LHS.getValueType() == MVT::f32) {
-    Cond = DAG.getNode(
-        ISD::SELECT_CC,
-        Op.getDebugLoc(),
-        MVT::f32,
-        LHS, RHS,
-        DAG.getConstantFP(1.0f, MVT::f32),
-        DAG.getConstantFP(0.0f, MVT::f32),
-        CC);
-    Cond = DAG.getNode(
-        ISD::FP_TO_SINT,
-        DL,
-        MVT::i32,
-        Cond);
-  } else {
-    assert(0 && "Not valid type for set_cc");
-  }
-  Cond = DAG.getNode(
-      ISD::AND,
-      DL,
-      MVT::i32,
-      DAG.getConstant(1, MVT::i32),
-      Cond);
-  return Cond;
-}
-
  /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
  /// convert these pointers to a register index.  Each register holds
  /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
@@ -820,7 +922,7 @@ SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
    default: llvm_unreachable("Invalid stack width");
    }
  
-  return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
+  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                       DAG.getConstant(SRLPad, MVT::i32));
  }
  
@@ -854,7 +956,7 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth,
  }
  
  SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
    StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
    SDValue Chain = Op.getOperand(0);
    SDValue Value = Op.getOperand(1);
@@ -916,7 +1018,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
        Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
      }
      Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
-    DAG.getTargetConstant(0, MVT::i32)); // Channel 
+    DAG.getTargetConstant(0, MVT::i32)); // Channel
    }
  
    return Chain;
@@ -966,7 +1068,7 @@ ConstantAddressBlock(unsigned AddressSpace) {
  SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
  {
    EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
    LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
    SDValue Chain = Op.getOperand(0);
    SDValue Ptr = Op.getOperand(1);
@@ -976,7 +1078,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
    if (ConstantBlock > -1) {
      SDValue Result;
      if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
-        dyn_cast<Constant>(LoadNode->getSrcValue())) {
+        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
+        dyn_cast<ConstantSDNode>(Ptr)) {
        SDValue Slots[4];
        for (unsigned i = 0; i < 4; i++) {
          // We want Const position encoded with the following formula :
@@ -992,7 +1095,9 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
      } else {
        // non constant ptr cant be folded, keeps it as a v4f32 load
        Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
-          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
+          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
+          DAG.getConstant(LoadNode->getAddressSpace() -
+                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
            );
      }
  
@@ -1057,15 +1162,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
    return DAG.getMergeValues(Ops, 2, DL);
  }
  
-SDValue R600TargetLowering::LowerFPOW(SDValue Op,
-    SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT VT = Op.getValueType();
-  SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
-  SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
-  return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
-}
-
  /// XXX Only kernel functions are supported, so we can assume for now that
  /// every function is a kernel function, but in the future we should use
  /// separate calling conventions for kernel and non-kernel functions.
@@ -1074,7 +1170,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
                                        CallingConv::ID CallConv,
                                        bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      DebugLoc DL, SelectionDAG &DAG,
+                                      SDLoc DL, SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals) const {
    unsigned ParamOffsetBytes = 36;
    Function::const_arg_iterator FuncArg =
@@ -1097,7 +1193,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
                                                      AMDGPUAS::PARAM_I_ADDRESS);
      SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                  DAG.getConstant(ParamOffsetBytes, MVT::i32),
-                                       MachinePointerInfo(new Argument(PtrTy)),
+                                       MachinePointerInfo(UndefValue::get(PtrTy)),
                                         ArgVT, false, false, ArgBytes);
      InVals.push_back(Arg);
      ParamOffsetBytes += ArgBytes;
@@ -1105,7 +1201,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
    return Chain;
  }
  
-EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
+EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
     if (!VT.isVector()) return MVT::i32;
     return VT.changeVectorElementTypeToInteger();
  }
@@ -1123,11 +1219,40 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::FP_ROUND: {
        SDValue Arg = N->getOperand(0);
        if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
-        return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
+        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                             Arg.getOperand(0));
        }
        break;
      }
+
+  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
+  // (i32 select_cc f32, f32, -1, 0 cc)
+  //
+  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
+  // this to one of the SET*_DX10 instructions.
+  case ISD::FP_TO_SINT: {
+    SDValue FNeg = N->getOperand(0);
+    if (FNeg.getOpcode() != ISD::FNEG) {
+      return SDValue();
+    }
+    SDValue SelectCC = FNeg.getOperand(0);
+    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
+        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
+        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
+        !isHWTrueValue(SelectCC.getOperand(2)) ||
+        !isHWFalseValue(SelectCC.getOperand(3))) {
+      return SDValue();
+    }
+
+    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
+                           SelectCC.getOperand(0), // LHS
+                           SelectCC.getOperand(1), // RHS
+                           DAG.getConstant(-1, MVT::i32), // True
+                           DAG.getConstant(0, MVT::i32),  // Flase
+                           SelectCC.getOperand(4)); // CC
+
+    break;
+  }
    // Extract_vec (Build_vector) generated by custom lowering
    // also needs to be customly combined
    case ISD::EXTRACT_VECTOR_ELT: {
@@ -1142,11 +1267,87 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
          Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
        if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
          unsigned Element = Const->getZExtValue();
-        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
+        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
              Arg->getOperand(0).getOperand(Element));
        }
      }
    }
+
+  case ISD::SELECT_CC: {
+    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
+    //      selectcc x, y, a, b, inv(cc)
+    //
+    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
+    //      selectcc x, y, a, b, cc
+    SDValue LHS = N->getOperand(0);
+    if (LHS.getOpcode() != ISD::SELECT_CC) {
+      return SDValue();
+    }
+
+    SDValue RHS = N->getOperand(1);
+    SDValue True = N->getOperand(2);
+    SDValue False = N->getOperand(3);
+    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+    if (LHS.getOperand(2).getNode() != True.getNode() ||
+        LHS.getOperand(3).getNode() != False.getNode() ||
+        RHS.getNode() != False.getNode()) {
+      return SDValue();
+    }
+
+    switch (NCC) {
+    default: return SDValue();
+    case ISD::SETNE: return LHS;
+    case ISD::SETEQ: {
+      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
+      LHSCC = ISD::getSetCCInverse(LHSCC,
+                                  LHS.getOperand(0).getValueType().isInteger());
+      return DAG.getSelectCC(SDLoc(N),
+                             LHS.getOperand(0),
+                             LHS.getOperand(1),
+                             LHS.getOperand(2),
+                             LHS.getOperand(3),
+                             LHSCC);
+    }
+    }
+  }
+  case AMDGPUISD::EXPORT: {
+    SDValue Arg = N->getOperand(1);
+    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+      break;
+    SDValue NewBldVec[4] = {
+        DAG.getUNDEF(MVT::f32),
+        DAG.getUNDEF(MVT::f32),
+        DAG.getUNDEF(MVT::f32),
+        DAG.getUNDEF(MVT::f32)
+      };
+    SDValue NewArgs[8] = {
+      N->getOperand(0), // Chain
+      SDValue(),
+      N->getOperand(2), // ArrayBase
+      N->getOperand(3), // Type
+      N->getOperand(4), // SWZ_X
+      N->getOperand(5), // SWZ_Y
+      N->getOperand(6), // SWZ_Z
+      N->getOperand(7) // SWZ_W
+    };
+    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
+      if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
+        if (C->isZero()) {
+          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
+        } else if (C->isExactlyValue(1.0)) {
+          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0
+        } else {
+          NewBldVec[i] = Arg.getOperand(i);
+        }
+      } else {
+        NewBldVec[i] = Arg.getOperand(i);
+      }
+    }
+    SDLoc DL(N);
+    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
+    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
+  }
    }
    return SDValue();
  }