X86: If SSE4.1 is missing lower SMUL_LOHI of v4i32 to pmuludq and fix up the high parts.

author Benjamin Kramer <benny.kra@googlemail.com>

Sun, 27 Apr 2014 18:47:41 +0000 (18:47 +0000)

committer Benjamin Kramer <benny.kra@googlemail.com>

Sun, 27 Apr 2014 18:47:41 +0000 (18:47 +0000)
author Benjamin Kramer <benny.kra@googlemail.com>
Sun, 27 Apr 2014 18:47:41 +0000 (18:47 +0000)
committer Benjamin Kramer <benny.kra@googlemail.com>
Sun, 27 Apr 2014 18:47:41 +0000 (18:47 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index a902cae51350a63e4a2399e247f8fb9849f93c9a..ac5f60c69f52fa71fca11ffe5bca3186d4779e0f 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -941,6 +941,7 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
      setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
      setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
+    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
      setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
      setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
      setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
@@ -1062,7 +1063,6 @@ void X86TargetLowering::resetOperationActions() {
  
      // FIXME: Do we need to handle scalar-to-vector here?
      setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
-    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
  
      setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
      setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
@@ -13166,8 +13166,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
    // Emit two multiplies, one for the lower 2 ints and one for the higher 2
    // ints.
    MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
    unsigned Opcode =
-      Op->getOpcode() == ISD::UMUL_LOHI ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+      (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
    SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
                               DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
    SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
@@ -13179,6 +13180,20 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
    const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
    SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  
+  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
+  // unsigned multiply.
+  if (IsSigned && !Subtarget->hasSSE41()) {
+    SDValue ShAmt =
+        DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
+  }
+
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
  }
  
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll

index 981c317157f1a26b9d53433c0330f37a6cbeed3e..5738c94e37bbc881614fb2735d272fa27d290b29 100644 (file)
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -1,19 +1,20 @@
-; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s -check-prefix=SSE
+; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=SSE41
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE
  ; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX
  
  define <4 x i32> @test1(<4 x i32> %a) {
    %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
    ret <4 x i32> %div
  
-; SSE-LABEL: test1:
-; SSE: pmuludq
-; SSE: pshufd  $57
-; SSE: pmuludq
-; SSE: shufps  $-35
-; SSE: psubd
-; SSE: psrld $1
-; SSE: padd
-; SSE: psrld $2
+; SSE41-LABEL: test1:
+; SSE41: pmuludq
+; SSE41: pshufd        $57
+; SSE41: pmuludq
+; SSE41: shufps        $-35
+; SSE41: psubd
+; SSE41: psrld $1
+; SSE41: padd
+; SSE41: psrld $2
  
  ; AVX-LABEL: test1:
  ; AVX: vpmuludq
@@ -46,12 +47,12 @@ define <8 x i16> @test3(<8 x i16> %a) {
    %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
    ret <8 x i16> %div
  
-; SSE-LABEL: test3:
-; SSE: pmulhuw
-; SSE: psubw
-; SSE: psrlw $1
-; SSE: paddw
-; SSE: psrlw $2
+; SSE41-LABEL: test3:
+; SSE41: pmulhuw
+; SSE41: psubw
+; SSE41: psrlw $1
+; SSE41: paddw
+; SSE41: psrlw $2
  
  ; AVX-LABEL: test3:
  ; AVX: vpmulhuw
@@ -78,11 +79,11 @@ define <8 x i16> @test5(<8 x i16> %a) {
    %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
    ret <8 x i16> %div
  
-; SSE-LABEL: test5:
-; SSE: pmulhw
-; SSE: psrlw $15
-; SSE: psraw $1
-; SSE: paddw
+; SSE41-LABEL: test5:
+; SSE41: pmulhw
+; SSE41: psrlw $15
+; SSE41: psraw $1
+; SSE41: paddw
  
  ; AVX-LABEL: test5:
  ; AVX: vpmulhw
@@ -112,13 +113,29 @@ define <4 x i32> @test8(<4 x i32> %a) {
    %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
    ret <4 x i32> %div
  
+; SSE41-LABEL: test8:
+; SSE41: pmuldq
+; SSE41: pshufd        $57
+; SSE41-NOT: pshufd    $57
+; SSE41: pmuldq
+; SSE41: shufps        $-35
+; SSE41: pshufd        $-40
+; SSE41: padd
+; SSE41: psrld $31
+; SSE41: psrad $2
+; SSE41: padd
+
  ; SSE-LABEL: test8:
-; SSE: pmuldq
+; SSE: psrad $31
+; SSE: pand
+; SSE: paddd
+; SSE: pmuludq
  ; SSE: pshufd  $57
  ; SSE-NOT: pshufd      $57
-; SSE: pmuldq
+; SSE: pmuludq
  ; SSE: shufps  $-35
  ; SSE: pshufd  $-40
+; SSE: psubd
  ; SSE: padd
  ; SSE: psrld $31
  ; SSE: psrad $2
author	Benjamin Kramer <benny.kra@googlemail.com>
	Sun, 27 Apr 2014 18:47:41 +0000 (18:47 +0000)
committer	Benjamin Kramer <benny.kra@googlemail.com>
	Sun, 27 Apr 2014 18:47:41 +0000 (18:47 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/vector-idiv.ll		patch \| blob \| history