AMDGPU: Pattern match ffbh pattern to instruction.

author Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 11 Jan 2016 17:02:00 +0000 (17:02 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 11 Jan 2016 17:02:00 +0000 (17:02 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 11 Jan 2016 17:02:00 +0000 (17:02 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 11 Jan 2016 17:02:00 +0000 (17:02 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 810b325b23054fbaf1341c5fca5629505e243992..fafe58d65b199c12441bc69cad9beebd07d6bfa6 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -282,7 +282,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
    setOperationAction(ISD::SMAX, MVT::i32, Legal);
    setOperationAction(ISD::UMAX, MVT::i32, Legal);
  
-  if (!Subtarget->hasFFBH())
+  if (Subtarget->hasFFBH())
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+  else
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
  
    if (!Subtarget->hasFFBL())
@@ -2170,9 +2172,11 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
  SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
    SDLoc SL(Op);
    SDValue Src = Op.getOperand(0);
-  assert(Src.getValueType() == MVT::i64);
-
    bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
+
+  if (ZeroUndef && Src.getValueType() == MVT::i32)
+    return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
+
    SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
  
    const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
@@ -2507,6 +2511,79 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
    return DAG.getSExtOrTrunc(Mul, DL, VT);
  }
  
+static bool isNegativeOne(SDValue Val) {
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
+    return C->isAllOnesValue();
+  return false;
+}
+
+static bool isCtlzOpc(unsigned Opc) {
+  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
+}
+
+// The native instructions return -1 on 0 input. Optimize out a select that
+// produces -1 on 0.
+//
+// TODO: If zero is not undef, we could also do this if the output is compared
+// against the bitwidth.
+//
+// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
+SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL,
+                                                 SDValue Cond,
+                                                 SDValue LHS,
+                                                 SDValue RHS,
+                                                 DAGCombinerInfo &DCI) const {
+  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+  if (!CmpRhs || !CmpRhs->isNullValue())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+  SDValue CmpLHS = Cond.getOperand(0);
+
+  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
+  if (CCOpcode == ISD::SETEQ &&
+      isCtlzOpc(RHS.getOpcode()) &&
+      RHS.getOperand(0) == CmpLHS &&
+      isNegativeOne(LHS)) {
+    return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, CmpLHS);
+  }
+
+  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
+  if (CCOpcode == ISD::SETNE &&
+      isCtlzOpc(LHS.getOpcode()) &&
+      LHS.getOperand(0) == CmpLHS &&
+      isNegativeOne(RHS)) {
+    return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, CmpLHS);
+  }
+
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
+                                                   DAGCombinerInfo &DCI) const {
+  SDValue Cond = N->getOperand(0);
+  if (Cond.getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDValue LHS = Cond.getOperand(0);
+  SDValue RHS = Cond.getOperand(1);
+  SDValue CC = Cond.getOperand(2);
+
+  SDValue True = N->getOperand(1);
+  SDValue False = N->getOperand(2);
+
+  if (VT == MVT::f32 && Cond.hasOneUse())
+    return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+
+  // There's no reason to not do this if the condition has other uses.
+  if (VT == MVT::i32)
+    return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
+
+  return SDValue();
+}
+
  SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
    SelectionDAG &DAG = DCI.DAG;
@@ -2531,23 +2608,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
      simplifyI24(N1, DCI);
      return SDValue();
    }
-  case ISD::SELECT: {
-    SDValue Cond = N->getOperand(0);
-    if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
-      EVT VT = N->getValueType(0);
-      SDValue LHS = Cond.getOperand(0);
-      SDValue RHS = Cond.getOperand(1);
-      SDValue CC = Cond.getOperand(2);
-
-      SDValue True = N->getOperand(1);
-      SDValue False = N->getOperand(2);
-
-      if (VT == MVT::f32)
-        return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
-    }
-
-    break;
-  }
+  case ISD::SELECT:
+    return performSelectCombine(N, DCI);
    case AMDGPUISD::BFE_I32:
    case AMDGPUISD::BFE_U32: {
      assert(!N->getValueType(0).isVector() &&
@@ -2759,6 +2821,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    NODE_NAME_CASE(BFE_I32)
    NODE_NAME_CASE(BFI)
    NODE_NAME_CASE(BFM)
+  NODE_NAME_CASE(FFBH_U32)
    NODE_NAME_CASE(MUL_U24)
    NODE_NAME_CASE(MUL_I24)
    NODE_NAME_CASE(MAD_U24)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h

index b53a9f611b4d11a509b194c2e314d85c2e69baed..952fd4cc5024dcfbacacb04817dbbd575de57c24 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -69,6 +69,9 @@ private:
    SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
    SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
    SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS,
+                             DAGCombinerInfo &DCI) const;
+  SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  
  protected:
    static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
@@ -265,6 +268,7 @@ enum NodeType : unsigned {
    BFE_I32, // Extract range of bits with sign extension to 32-bits.
    BFI, // (src0 & src1) | (~src0 & src2)
    BFM, // Insert a range of bits into a 32-bit word.
+  FFBH_U32, // ctlz with -1 if input is zero.
    MUL_U24,
    MUL_I24,
    MAD_U24,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td

index 70e589c284299f3c65bf2b8854ce72320ee2a3ee..b7a263e34e78d79779afbc1181f0e61d0bed4736 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -191,6 +191,8 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
  def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
  def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
  
+def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
+
  // Signed and unsigned 24-bit mulitply.  The highest 8-bits are ignore when
  // performing the mulitply.  The result is a 32-bit value.
  def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td

index 779a14e95d22b492ff6148c72be8d6d55db74bff..2245f1417e53c80f016b918b40096e5b37377771 100644 (file)
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -349,7 +349,7 @@ def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
  def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>;
  def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>;
  
-def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>;
+def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
  def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
  
  let hasSideEffects = 1 in {
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index f7a9880752f884b235602aa6252e7d22de50ed78..e85d78a69ee6a6620d8f04f754bfd8acb13b581b 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2026,7 +2026,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
  
    case ISD::UINT_TO_FP: {
      return performUCharToFloatCombine(N, DCI);
-
+  }
    case ISD::FADD: {
      if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
        break;
@@ -2108,7 +2108,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
  
      break;
    }
-  }
    case ISD::LOAD:
    case ISD::STORE:
    case ISD::ATOMIC_LOAD:
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td

index b7df058b7c0c825b854b18afbf30e284d9be3b50..89692ab71f4d899813281f1c22d1fd84f9f73081 100644 (file)
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -144,7 +144,7 @@ defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32",
  defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>;
  
  defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
-  [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
+  [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))]
  >;
  
  defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll

index d0e0f621df1fd8d44287929af81307944ffdb076..fcff1b585e36518fc85ddb8d4437daa775fcaf6e 100644 (file)
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -150,3 +150,62 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)*
    store i32 %trunc, i32 addrspace(1)* %out.gep
    ret void
  }
+
+; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+ define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+  %cmp = icmp eq i32 %val, 0
+  %sel = select i1 %cmp, i32 -1, i32 %ctlz
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+  %cmp = icmp ne i32 %val, 0
+  %sel = select i1 %cmp, i32 %ctlz, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; TODO: Should be able to eliminate select here as well.
+; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: s_endpgm
+define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+  %cmp = icmp eq i32 %ctlz, 32
+  %sel = select i1 %cmp, i32 -1, i32 %ctlz
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: s_endpgm
+define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+  %cmp = icmp ne i32 %ctlz, 32
+  %sel = select i1 %cmp, i32 %ctlz, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

index b019108d11c1f6dff870193a95efbb44e6919caf..9dc99f95c45fe50055e29566f463fac58a74f0d6 100644 (file)
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -2,6 +2,8 @@
  ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
  ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
+declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
+
  declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
  declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
  declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
@@ -131,3 +133,123 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
    store i32 %trunc, i32 addrspace(1)* %out.gep
    ret void
  }
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI-NEXT: buffer_store_dword [[RESULT]],
+ define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+  %cmp = icmp eq i32 %val, 0
+  %sel = select i1 %cmp, i32 -1, i32 %ctlz
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI-NEXT: buffer_store_dword [[RESULT]],
+define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+  %cmp = icmp ne i32 %val, 0
+  %sel = select i1 %cmp, i32 %ctlz, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]]
+; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[VAL]]
+; SI-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc
+; SI-DAG: buffer_store_dword [[RESULT0]]
+; SI-DAG: buffer_store_byte [[RESULT1]]
+; SI: s_endpgm
+ define void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+  %cmp = icmp eq i32 %val, 0
+  %sel = select i1 %cmp, i32 -1, i32 %ctlz
+  store volatile i32 %sel, i32 addrspace(1)* %out
+  store volatile i1 %cmp, i1 addrspace(1)* undef
+  ret void
+}
+
+; Selected on wrong constant
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_0:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: buffer_store_dword
+ define void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+  %cmp = icmp eq i32 %val, 0
+  %sel = select i1 %cmp, i32 0, i32 %ctlz
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; Selected on wrong constant
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_0:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: buffer_store_dword
+define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+  %cmp = icmp ne i32 %val, 0
+  %sel = select i1 %cmp, i32 %ctlz, i32 0
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; Compare on wrong constant
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: buffer_store_dword
+ define void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+  %cmp = icmp eq i32 %val, 1
+  %sel = select i1 %cmp, i32 0, i32 %ctlz
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; Selected on wrong constant
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: buffer_store_dword
+define void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+  %cmp = icmp ne i32 %val, 1
+  %sel = select i1 %cmp, i32 %ctlz, i32 0
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[FFBH]]
+; SI: buffer_store_dword [[RESULT]],
+define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i8, i8 addrspace(1)* %valptr
+  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
+  store i8 %ctlz, i8 addrspace(1)* %out
+  ret void
+}
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 11 Jan 2016 17:02:00 +0000 (17:02 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 11 Jan 2016 17:02:00 +0000 (17:02 +0000)
lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUISelLowering.h		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUInstrInfo.td		patch \| blob \| history
lib/Target/AMDGPU/EvergreenInstructions.td		patch \| blob \| history
lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
test/CodeGen/AMDGPU/ctlz.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ctlz_zero_undef.ll		patch \| blob \| history