[PPC64LE] Generate correct code for unaligned little-endian vector loads

author Bill Schmidt <wschmidt@linux.vnet.ibm.com>
Mon, 9 Jun 2014 22:00:52 +0000 (22:00 +0000)
committer Bill Schmidt <wschmidt@linux.vnet.ibm.com>
Mon, 9 Jun 2014 22:00:52 +0000 (22:00 +0000)
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp

index 91366e2ccebfbbb3bc2e0f44fa41d72cdfc0940f..b7d7aaa5a88dcdc399eae914572412ebd5771608 100644 (file)
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -8095,6 +8095,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
        // This is a type-legal unaligned Altivec load.
        SDValue Chain = LD->getChain();
        SDValue Ptr = LD->getBasePtr();
+      bool isLittleEndian = PPCSubTarget.isLittleEndian();
  
        // This implements the loading of unaligned vectors as described in
        // the venerable Apple Velocity Engine overview. Specifically:
@@ -8102,25 +8103,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
        // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
        //
        // The general idea is to expand a sequence of one or more unaligned
-      // loads into a alignment-based permutation-control instruction (lvsl),
-      // a series of regular vector loads (which always truncate their
-      // input address to an aligned address), and a series of permutations.
-      // The results of these permutations are the requested loaded values.
-      // The trick is that the last "extra" load is not taken from the address
-      // you might suspect (sizeof(vector) bytes after the last requested
-      // load), but rather sizeof(vector) - 1 bytes after the last
-      // requested vector. The point of this is to avoid a page fault if the
-      // base address happened to be aligned. This works because if the base
-      // address is aligned, then adding less than a full vector length will
-      // cause the last vector in the sequence to be (re)loaded. Otherwise,
-      // the next vector will be fetched as you might suspect was necessary.
+      // loads into an alignment-based permutation-control instruction (lvsl
+      // or lvsr), a series of regular vector loads (which always truncate
+      // their input address to an aligned address), and a series of
+      // permutations.  The results of these permutations are the requested
+      // loaded values.  The trick is that the last "extra" load is not taken
+      // from the address you might suspect (sizeof(vector) bytes after the
+      // last requested load), but rather sizeof(vector) - 1 bytes after the
+      // last requested vector. The point of this is to avoid a page fault if
+      // the base address happened to be aligned. This works because if the
+      // base address is aligned, then adding less than a full vector length
+      // will cause the last vector in the sequence to be (re)loaded.
+      // Otherwise, the next vector will be fetched as you might suspect was
+      // necessary.
  
        // We might be able to reuse the permutation generation from
        // a different base address offset from this one by an aligned amount.
        // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
        // optimization later.
-      SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr,
-                                          DAG, dl, MVT::v16i8);
+      Intrinsic::ID Intr = (isLittleEndian ?
+                            Intrinsic::ppc_altivec_lvsr :
+                            Intrinsic::ppc_altivec_lvsl);
+      SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
  
        // Refine the alignment of the original load (a "new" load created here
        // which was identical to the first except for the alignment would be
@@ -8169,8 +8173,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
        if (ExtraLoad.getValueType() != MVT::v4i32)
          ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
  
-      SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
-                                      BaseLoad, ExtraLoad, PermCntl, DAG, dl);
+      // Because vperm has a big-endian bias, we must reverse the order
+      // of the input vectors and complement the permute control vector
+      // when generating little endian code.  We have already handled the
+      // latter by using lvsr instead of lvsl, so just reverse BaseLoad
+      // and ExtraLoad here.
+      SDValue Perm;
+      if (isLittleEndian)
+        Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+                                ExtraLoad, BaseLoad, PermCntl, DAG, dl);
+      else
+        Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+                                BaseLoad, ExtraLoad, PermCntl, DAG, dl);
  
        if (VT != MVT::v4i32)
          Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
@@ -8210,9 +8224,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
      }
      }
      break;
-  case ISD::INTRINSIC_WO_CHAIN:
-    if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() ==
-          Intrinsic::ppc_altivec_lvsl &&
+  case ISD::INTRINSIC_WO_CHAIN: {
+    bool isLittleEndian = PPCSubTarget.isLittleEndian();
+    Intrinsic::ID Intr = (isLittleEndian ?
+                          Intrinsic::ppc_altivec_lvsr :
+                          Intrinsic::ppc_altivec_lvsl);
+    if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == Intr &&
          N->getOperand(1)->getOpcode() == ISD::ADD) {
        SDValue Add = N->getOperand(1);
  
@@ -8224,8 +8241,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
               UE = BasePtr->use_end(); UI != UE; ++UI) {
            if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
                cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
-                Intrinsic::ppc_altivec_lvsl) {
-            // We've found another LVSL, and this address if an aligned
+                Intr) {
+            // We've found another LVSL/LVSR, and this address is an aligned
              // multiple of that one. The results will be the same, so use the
              // one we've just found instead.
  
@@ -8234,6 +8251,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
          }
        }
      }
+    }
  
      break;
    case ISD::BSWAP:
diff --git a/test/CodeGen/PowerPC/vec_misaligned.ll b/test/CodeGen/PowerPC/vec_misaligned.ll

index d7ed64a5b1cfcfa2fc799e5873de2b2fbbc22bdb..304a84d49a9da82dc3cddace2b7172ba05cc5558 100644 (file)
--- a/test/CodeGen/PowerPC/vec_misaligned.ll
+++ b/test/CodeGen/PowerPC/vec_misaligned.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=ppc32 -mcpu=g5
+; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
  
  target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
  target triple = "powerpc-apple-darwin8"
@@ -8,6 +10,8 @@ target triple = "powerpc-apple-darwin8"
  
  define void @foo(i32 %x, ...) {
  entry:
+; CHECK: foo:
+; CHECK-LE: foo:
         %x_addr = alloca i32            ; <i32*> [#uses=1]
         %ap = alloca i8*                ; <i8**> [#uses=3]
         %ap.0 = alloca i8*              ; <i8**> [#uses=3]
@@ -27,6 +31,10 @@ entry:
         %tmp8 = getelementptr %struct.u16qi* %tmp6, i32 0, i32 0                ; <<16 x i8>*> [#uses=1]
         %tmp9 = getelementptr %struct.u16qi* %tmp7, i32 0, i32 0                ; <<16 x i8>*> [#uses=1]
         %tmp10 = load <16 x i8>* %tmp9, align 4         ; <<16 x i8>> [#uses=1]
+; CHECK: lvsl
+; CHECK: vperm
+; CHECK-LE: lvsr
+; CHECK-LE: vperm
         store <16 x i8> %tmp10, <16 x i8>* %tmp8, align 4
         br label %return
author	Bill Schmidt <wschmidt@linux.vnet.ibm.com>
	Mon, 9 Jun 2014 22:00:52 +0000 (22:00 +0000)
committer	Bill Schmidt <wschmidt@linux.vnet.ibm.com>
	Mon, 9 Jun 2014 22:00:52 +0000 (22:00 +0000)
lib/Target/PowerPC/PPCISelLowering.cpp		patch | blob | history
test/CodeGen/PowerPC/vec_misaligned.ll		patch | blob | history