ZERO_EXTEND/SIGN_EXTEND/TRUNCATE optimization for AVX2

author Elena Demikhovsky <elena.demikhovsky@intel.com>
Sun, 22 Apr 2012 09:39:03 +0000 (09:39 +0000)
committer Elena Demikhovsky <elena.demikhovsky@intel.com>
Sun, 22 Apr 2012 09:39:03 +0000 (09:39 +0000)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index 1ed1ee77e466114911544054803ee0e0fd185f94..e72c8d5f2d4c8e0e3583a6d5022cfd20e3ef0709 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4520,8 +4520,10 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
      SDValue Op = N0.getOperand(0);
      if (Op.getValueType().bitsLT(VT)) {
        Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op);
+      AddToWorkList(Op.getNode());
      } else if (Op.getValueType().bitsGT(VT)) {
        Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+      AddToWorkList(Op.getNode());
      }
      return DAG.getZeroExtendInReg(Op, N->getDebugLoc(),
                                    N0.getValueType().getScalarType());
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index a03b97f3215eb7cb7ac93d1b874eb795c8be5eaf..5e52b84efd3ca048ce476e2f0518c3b0c0220ee9 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1222,6 +1222,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setTargetDAGCombine(ISD::LOAD);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::TRUNCATE);
    setTargetDAGCombine(ISD::SINT_TO_FP);
@@ -13033,6 +13034,20 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
  
    if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
  
+    if (Subtarget->hasAVX2()) {
+      // AVX2: v4i64 -> v4i32
+
+      // VPERMD
+      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
+      Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
+                                ShufMask);
+
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op, DAG.getIntPtrConstant(0));
+    }
+
+    // AVX: v4i64 -> v4i32
      SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                            DAG.getIntPtrConstant(0));
  
@@ -13057,6 +13072,40 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
    }
    if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
  
+    if (Subtarget->hasAVX2()) {
+      // AVX2: v8i32 -> v8i16
+
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
+      // PSHUFB
+      SmallVector<SDValue,32> pshufbMask;
+      for (unsigned i = 0; i < 2; ++i) {
+        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+        for (unsigned j = 0; j < 8; ++j)
+          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+      }
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8, &pshufbMask[0], 
+                               32);
+      Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
+
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
+
+      static const int ShufMask[] = {0,  2,  -1,  -1};
+      Op = DAG.getVectorShuffle(MVT::v4i64, dl,  Op, DAG.getUNDEF(MVT::v4i64), 
+                                &ShufMask[0]);
+
+      Op =  DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                        DAG.getIntPtrConstant(0));
+
+      return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+    }
+
      SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
                            DAG.getIntPtrConstant(0));
  
@@ -14822,15 +14871,6 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    if (!Subtarget->hasAVX()) 
      return SDValue();
  
-  // Optimize vectors in AVX mode
-  // Sign extend  v8i16 to v8i32 and
-  //              v4i32 to v4i64
-  //
-  // Divide input vector into two parts
-  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
-  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
-  // concat the vectors to original VT
-
    EVT VT = N->getValueType(0);
    SDValue Op = N->getOperand(0);
    EVT OpVT = Op.getValueType();
@@ -14839,6 +14879,19 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
        (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
  
+    if (Subtarget->hasAVX2()) {
+      return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
+    }
+
+    // Optimize vectors in AVX mode
+    // Sign extend  v8i16 to v8i32 and
+    //              v4i32 to v4i64
+    //
+    // Divide input vector into two parts
+    // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+    // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+    // concat the vectors to original VT
+
      unsigned NumElems = OpVT.getVectorNumElements();
      SmallVector<int,8> ShufMask1(NumElems, -1);
      for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
@@ -14906,6 +14959,9 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
      if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
          ((VT == MVT::v4i64) && (OpVT == MVT::v4i32)))  {
  
+      if (Subtarget->hasAVX2())
+        return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
+
        SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
        SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec,
                                            DAG);
@@ -15108,6 +15164,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
    case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
    case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
+  case ISD::ANY_EXTEND:
    case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, Subtarget);
    case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
    case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td

index 35801e43229be8c4f61bcddb1ae0f3fb1697b327..ffc6cbea3f186d054c0db1d898a72b21fcdaa9cb 100644 (file)
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -71,9 +71,14 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS",
                                        SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
  def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
                   SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzmovly  : SDNode<"X86ISD::VZEXT_MOVL",
+                 SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, 
+                                      SDTCisOpSmallerThanOp<1, 0> ]>>;
+
  def X86vsmovl  : SDNode<"X86ISD::VSEXT_MOVL",
                   SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
-                 
+
  def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                          [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
  def X86vshldq  : SDNode<"X86ISD::VSHLDQ",    SDTIntShiftOp>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index 65e3c1e19fa90dd422300a5c9f407feab81926d5..450d29a8574f1e4e76ff96b62eb89ebe71d0292a 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5730,14 +5730,26 @@ let Predicates = [HasSSE41] in {
              (PMOVZXDQrm addr:$src)>;
  }
  
+let Predicates = [HasAVX2] in {
+  let AddedComplexity = 15 in {
+    def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))),
+              (VPMOVZXDQYrr VR128:$src)>;
+    def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
+              (VPMOVZXWDYrr VR128:$src)>;
+  }
+
+  def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+  def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+}
+
  let Predicates = [HasAVX] in {
-def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
-def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
  }
  
  let Predicates = [HasSSE41] in {
-def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
-def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
  }
  
  
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll

new file mode 100755 (executable)

index 0000000..fe87de9
--- /dev/null
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s\r
+\r
+; CHECK: trunc4\r
+; CHECK: vpermd\r
+; CHECK-NOT: vinsert\r
+; CHECK: ret\r
+define <4 x i32> @trunc4(<4 x i64> %A) nounwind {\r
+  %B = trunc <4 x i64> %A to <4 x i32>\r
+  ret <4 x i32>%B\r
+}\r
+\r
+; CHECK: trunc8\r
+; CHECK: vpshufb\r
+; CHECK-NOT: vinsert\r
+; CHECK: ret\r
+\r
+define <8 x i16> @trunc8(<8 x i32> %A) nounwind {\r
+  %B = trunc <8 x i32> %A to <8 x i16>\r
+  ret <8 x i16>%B\r
+}\r
+\r
+; CHECK: sext4\r
+; CHECK: vpmovsxdq\r
+; CHECK-NOT: vinsert\r
+; CHECK: ret\r
+define <4 x i64> @sext4(<4 x i32> %A) nounwind {\r
+  %B = sext <4 x i32> %A to <4 x i64>\r
+  ret <4 x i64>%B\r
+}\r
+\r
+; CHECK: sext8\r
+; CHECK: vpmovsxwd\r
+; CHECK-NOT: vinsert\r
+; CHECK: ret\r
+define <8 x i32> @sext8(<8 x i16> %A) nounwind {\r
+  %B = sext <8 x i16> %A to <8 x i32>\r
+  ret <8 x i32>%B\r
+}\r
+\r
+; CHECK: zext4\r
+; CHECK: vpmovzxdq\r
+; CHECK-NOT: vinsert\r
+; CHECK: ret\r
+define <4 x i64> @zext4(<4 x i32> %A) nounwind {\r
+  %B = zext <4 x i32> %A to <4 x i64>\r
+  ret <4 x i64>%B\r
+}\r
+\r
+; CHECK: zext8\r
+; CHECK: vpmovzxwd\r
+; CHECK-NOT: vinsert\r
+; CHECK: ret\r
+define <8 x i32> @zext8(<8 x i16> %A) nounwind {\r
+  %B = zext <8 x i16> %A to <8 x i32>\r
+  ret <8 x i32>%B\r
+}\r
+; CHECK: zext_8i8_8i32\r
+; CHECK: vpmovzxwd\r
+; CHECK: vpand\r
+; CHECK: ret\r
+define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {\r
+  %B = zext <8 x i8> %A to <8 x i32>  \r
+  ret <8 x i32>%B\r
+}\r
+\r
+\r
+\r
+\r
author	Elena Demikhovsky <elena.demikhovsky@intel.com>
	Sun, 22 Apr 2012 09:39:03 +0000 (09:39 +0000)
committer	Elena Demikhovsky <elena.demikhovsky@intel.com>
	Sun, 22 Apr 2012 09:39:03 +0000 (09:39 +0000)
lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch | blob | history
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
lib/Target/X86/X86InstrFragmentsSIMD.td		patch | blob | history
lib/Target/X86/X86InstrSSE.td		patch | blob | history
test/CodeGen/X86/avx2-conversions.ll	[new file with mode: 0755]	patch | blob