Fix the alignment requirements of several unpck and shuf instructions.

author Dan Gohman <gohman@apple.com>
Thu, 2 Aug 2007 21:17:01 +0000 (21:17 +0000)
committer Dan Gohman <gohman@apple.com>
Thu, 2 Aug 2007 21:17:01 +0000 (21:17 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 91da4c0b6c8970004b2aaa9b35df2665312ca0cd..a5eb00a60a51088d76064125da634adf3ef34af4 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1578,7 +1578,7 @@ static bool isUndefOrEqual(SDOperand Op, unsigned Val) {
  bool X86::isPSHUFDMask(SDNode *N) {
    assert(N->getOpcode() == ISD::BUILD_VECTOR);
  
-  if (N->getNumOperands() != 4)
+  if (N->getNumOperands() != 2 && N->getNumOperands() != 4)
      return false;
  
    // Check if the value doesn't reference the second vector.
@@ -1586,7 +1586,7 @@ bool X86::isPSHUFDMask(SDNode *N) {
      SDOperand Arg = N->getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    if (cast<ConstantSDNode>(Arg)->getValue() >= 4)
+    if (cast<ConstantSDNode>(Arg)->getValue() >= e)
        return false;
    }
  
@@ -2767,7 +2767,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
  
    // If VT is integer, try PSHUF* first, then SHUFP*.
    if (MVT::isInteger(VT)) {
-    if (X86::isPSHUFDMask(PermMask.Val) ||
+    // MMX doesn't have PSHUFD; it does have PSHUFW. While it's theoretically
+    // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
+    if (((MVT::getSizeInBits(VT) != 64 || NumElems == 4) &&
+         X86::isPSHUFDMask(PermMask.Val)) ||
          X86::isPSHUFHWMask(PermMask.Val) ||
          X86::isPSHUFLWMask(PermMask.Val)) {
        if (V2.getOpcode() != ISD::UNDEF)
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index 63065c95cf09b9eff063e36d7830b6b71e67f2aa..7ed69ea19e15edb544016376da0833cd16d79ab5 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -808,7 +808,7 @@ let isTwoAddress = 1 in {
                          "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                            (v4f32 (vector_shuffle
-                                  VR128:$src1, (load addr:$src2),
+                                  VR128:$src1, (memopv4f32 addr:$src2),
                                    SHUFP_shuffle_mask:$src3)))]>;
  
    let AddedComplexity = 10 in {
@@ -824,7 +824,7 @@ let isTwoAddress = 1 in {
                           "unpckhps\t{$src2, $dst|$dst, $src2}",
                           [(set VR128:$dst,
                             (v4f32 (vector_shuffle
-                                   VR128:$src1, (load addr:$src2),
+                                   VR128:$src1, (memopv4f32 addr:$src2),
                                     UNPCKH_shuffle_mask)))]>;
  
      def UNPCKLPSrr : PSI<0x14, MRMSrcReg, 
@@ -839,7 +839,7 @@ let isTwoAddress = 1 in {
                           "unpcklps\t{$src2, $dst|$dst, $src2}",
                           [(set VR128:$dst,
                             (v4f32 (vector_shuffle
-                                   VR128:$src1, (load addr:$src2),
+                                   VR128:$src1, (memopv4f32 addr:$src2),
                                     UNPCKL_shuffle_mask)))]>;
    } // AddedComplexity
  } // isTwoAddress
@@ -1561,7 +1561,7 @@ let isTwoAddress = 1 in {
                          "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                            (v2f64 (vector_shuffle
-                                  VR128:$src1, (load addr:$src2),
+                                  VR128:$src1, (memopv2f64 addr:$src2),
                                    SHUFP_shuffle_mask:$src3)))]>;
  
    let AddedComplexity = 10 in {
@@ -1577,7 +1577,7 @@ let isTwoAddress = 1 in {
                           "unpckhpd\t{$src2, $dst|$dst, $src2}",
                           [(set VR128:$dst,
                             (v2f64 (vector_shuffle
-                                   VR128:$src1, (load addr:$src2),
+                                   VR128:$src1, (memopv2f64 addr:$src2),
                                     UNPCKH_shuffle_mask)))]>;
  
      def UNPCKLPDrr : PDI<0x14, MRMSrcReg, 
@@ -1592,7 +1592,7 @@ let isTwoAddress = 1 in {
                           "unpcklpd\t{$src2, $dst|$dst, $src2}",
                           [(set VR128:$dst,
                             (v2f64 (vector_shuffle
-                                   VR128:$src1, (load addr:$src2),
+                                   VR128:$src1, (memopv2f64 addr:$src2),
                                     UNPCKL_shuffle_mask)))]>;
    } // AddedComplexity
  } // isTwoAddress
@@ -1782,7 +1782,7 @@ let isTwoAddress = 1 in {
                      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                      "pandn\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
-                                              (load addr:$src2))))]>;
+                                              (memopv2i64 addr:$src2))))]>;
  }
  
  // SSE2 Integer comparison
@@ -2419,6 +2419,11 @@ def : Pat<(vector_shuffle (v4f32 VR128:$src1), (undef),
             SHUFP_unary_shuffle_mask:$sm),
            (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
        Requires<[HasSSE1]>;
+// Special unary SHUFPDrri case.
+def : Pat<(vector_shuffle (v2f64 VR128:$src1), (undef),
+           SHUFP_unary_shuffle_mask:$sm),
+          (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
+      Requires<[HasSSE2]>;
  // Unary v4f32 shuffle with PSHUF* in order to fold a load.
  def : Pat<(vector_shuffle (memopv4f32 addr:$src1), (undef),
             SHUFP_unary_shuffle_mask:$sm),
@@ -2583,13 +2588,13 @@ def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
            (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
  
  def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
-                  (load addr:$src2))),
+                  (memopv2i64 addr:$src2))),
            (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
  def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
-                  (load addr:$src2))),
+                  (memopv2i64 addr:$src2))),
            (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
  def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
-                  (load addr:$src2))),
+                  (memopv2i64 addr:$src2))),
            (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
  
  // Use movaps / movups for SSE integer load / store (one byte shorter).
diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll

new file mode 100644 (file)

index 0000000..731d429
--- /dev/null
+++ b/test/CodeGen/X86/sse-align-12.ll
@@ -0,0 +1,50 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep unpck | wc -l | grep 2
+; RUN: llvm-as < %s | llc -march=x86-64 | grep shuf  | wc -l | grep 2
+; RUN: llvm-as < %s | llc -march=x86-64 | grep ps    | wc -l | grep 4
+; RUN: llvm-as < %s | llc -march=x86-64 | grep pd    | wc -l | grep 4
+; RUN: llvm-as < %s | llc -march=x86-64 | grep movup | wc -l | grep 4
+
+define <4 x float> @a(<4 x float>* %y)
+{
+  %x = load <4 x float>* %y, align 4
+  %a = extractelement <4 x float> %x, i32 0
+  %b = extractelement <4 x float> %x, i32 1
+  %c = extractelement <4 x float> %x, i32 2
+  %d = extractelement <4 x float> %x, i32 3
+  %p = insertelement <4 x float> undef, float %d, i32 0
+  %q = insertelement <4 x float> %p, float %c, i32 1
+  %r = insertelement <4 x float> %q, float %b, i32 2
+  %s = insertelement <4 x float> %r, float %a, i32 3
+  ret <4 x float> %s
+}
+define <4 x float> @b(<4 x float>* %y, <4 x float> %z)
+{
+  %x = load <4 x float>* %y, align 4
+  %a = extractelement <4 x float> %x, i32 2
+  %b = extractelement <4 x float> %x, i32 3
+  %c = extractelement <4 x float> %z, i32 2
+  %d = extractelement <4 x float> %z, i32 3
+  %p = insertelement <4 x float> undef, float %c, i32 0
+  %q = insertelement <4 x float> %p, float %a, i32 1
+  %r = insertelement <4 x float> %q, float %d, i32 2
+  %s = insertelement <4 x float> %r, float %b, i32 3
+  ret <4 x float> %s
+}
+define <2 x double> @c(<2 x double>* %y)
+{
+  %x = load <2 x double>* %y, align 8
+  %a = extractelement <2 x double> %x, i32 0
+  %c = extractelement <2 x double> %x, i32 1
+  %p = insertelement <2 x double> undef, double %c, i32 0
+  %r = insertelement <2 x double> %p, double %a, i32 1
+  ret <2 x double> %r
+}
+define <2 x double> @d(<2 x double>* %y, <2 x double> %z)
+{
+  %x = load <2 x double>* %y, align 8
+  %a = extractelement <2 x double> %x, i32 1
+  %c = extractelement <2 x double> %z, i32 1
+  %p = insertelement <2 x double> undef, double %c, i32 0
+  %r = insertelement <2 x double> %p, double %a, i32 1
+  ret <2 x double> %r
+}
author	Dan Gohman <gohman@apple.com>
	Thu, 2 Aug 2007 21:17:01 +0000 (21:17 +0000)
committer	Dan Gohman <gohman@apple.com>
	Thu, 2 Aug 2007 21:17:01 +0000 (21:17 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
lib/Target/X86/X86InstrSSE.td		patch | blob | history
test/CodeGen/X86/sse-align-12.ll	[new file with mode: 0644]	patch | blob