[PowerPC 1/4] Little-endian adjustments for VSX loads/stores

author Bill Schmidt <wschmidt@linux.vnet.ibm.com>

Tue, 9 Dec 2014 16:35:51 +0000 (16:35 +0000)

committer Bill Schmidt <wschmidt@linux.vnet.ibm.com>

Tue, 9 Dec 2014 16:35:51 +0000 (16:35 +0000)
author Bill Schmidt <wschmidt@linux.vnet.ibm.com>
Tue, 9 Dec 2014 16:35:51 +0000 (16:35 +0000)
committer Bill Schmidt <wschmidt@linux.vnet.ibm.com>
Tue, 9 Dec 2014 16:35:51 +0000 (16:35 +0000)
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp

index 0bc10a1f6039a5abf288ba9118ef84cb9f234b58..fbcc34ee6557c598fa6e981547b355ca7cb1bbc3 100644 (file)
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -639,6 +639,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
      setTargetDAGCombine(ISD::BRCOND);
    setTargetDAGCombine(ISD::BSWAP);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
  
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
@@ -8301,6 +8303,105 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
                                   N->getOperand(0), ShiftCst), ShiftCst);
  }
  
+// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
+// builtins) into loads with swaps.
+SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Chain;
+  SDValue Base;
+  MachineMemOperand *MMO;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode for little endian VSX load");
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    Chain = LD->getChain();
+    Base = LD->getBasePtr();
+    MMO = LD->getMemOperand();
+    // If the MMO suggests this isn't a load of a full vector, leave
+    // things alone.  For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MMO->getSize() < 16)
+      return SDValue();
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+    Chain = Intrin->getChain();
+    Base = Intrin->getBasePtr();
+    MMO = Intrin->getMemOperand();
+    break;
+  }
+  }
+
+  MVT VecTy = N->getValueType(0).getSimpleVT();
+  SDValue LoadOps[] = { Chain, Base };
+  SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
+                                         DAG.getVTList(VecTy, MVT::Other),
+                                         LoadOps, VecTy, MMO);
+  DCI.AddToWorklist(Load.getNode());
+  Chain = Load.getValue(1);
+  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+                             DAG.getVTList(VecTy, MVT::Other), Chain, Load);
+  DCI.AddToWorklist(Swap.getNode());
+  return Swap;
+}
+
+// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
+// builtins) into stores with swaps.
+SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Chain;
+  SDValue Base;
+  unsigned SrcOpnd;
+  MachineMemOperand *MMO;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode for little endian VSX store");
+  case ISD::STORE: {
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    Chain = ST->getChain();
+    Base = ST->getBasePtr();
+    MMO = ST->getMemOperand();
+    SrcOpnd = 1;
+    // If the MMO suggests this isn't a store of a full vector, leave
+    // things alone.  For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MMO->getSize() < 16)
+      return SDValue();
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+    Chain = Intrin->getChain();
+    // Intrin->getBasePtr() oddly does not get what we want.
+    Base = Intrin->getOperand(3);
+    MMO = Intrin->getMemOperand();
+    SrcOpnd = 2;
+    break;
+  }
+  }
+
+  SDValue Src = N->getOperand(SrcOpnd);
+  MVT VecTy = Src.getValueType().getSimpleVT();
+  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+                             DAG.getVTList(VecTy, MVT::Other), Chain, Src);
+  DCI.AddToWorklist(Swap.getNode());
+  Chain = Swap.getValue(1);
+  SDValue StoreOps[] = { Chain, Swap, Base };
+  SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
+                                          DAG.getVTList(MVT::Other),
+                                          StoreOps, VecTy, MMO);
+  DCI.AddToWorklist(Store.getNode());
+  return Store;
+}
+
  SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
    const TargetMachine &TM = getTargetMachine();
@@ -8366,7 +8467,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
        }
      }
      break;
-  case ISD::STORE:
+  case ISD::STORE: {
      // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
      if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
          !cast<StoreSDNode>(N)->isTruncatingStore() &&
@@ -8417,10 +8518,33 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                  Ops, cast<StoreSDNode>(N)->getMemoryVT(),
                                  cast<StoreSDNode>(N)->getMemOperand());
      }
+
+    // For little endian, VSX stores require generating xxswapd/lxvd2x.
+    EVT VT = N->getOperand(1).getValueType();
+    if (VT.isSimple()) {
+      MVT StoreVT = VT.getSimpleVT();
+      if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+          TM.getSubtarget<PPCSubtarget>().isLittleEndian() &&
+          (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
+           StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
+        return expandVSXStoreForLE(N, DCI);
+    }
      break;
+  }
    case ISD::LOAD: {
      LoadSDNode *LD = cast<LoadSDNode>(N);
      EVT VT = LD->getValueType(0);
+
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    if (VT.isSimple()) {
+      MVT LoadVT = VT.getSimpleVT();
+      if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+          TM.getSubtarget<PPCSubtarget>().isLittleEndian() &&
+          (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
+           LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
+        return expandVSXLoadForLE(N, DCI);
+    }
+
      Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
      unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
      if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
@@ -8569,6 +8693,34 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
      }
  
      break;
+  case ISD::INTRINSIC_W_CHAIN: {
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+        TM.getSubtarget<PPCSubtarget>().isLittleEndian()) {
+      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+      default:
+        break;
+      case Intrinsic::ppc_vsx_lxvw4x:
+      case Intrinsic::ppc_vsx_lxvd2x:
+        return expandVSXLoadForLE(N, DCI);
+      }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    // For little endian, VSX stores require generating xxswapd/stxvd2x.
+    if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+        TM.getSubtarget<PPCSubtarget>().isLittleEndian()) {
+      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+      default:
+        break;
+      case Intrinsic::ppc_vsx_stxvw4x:
+      case Intrinsic::ppc_vsx_stxvd2x:
+        return expandVSXStoreForLE(N, DCI);
+      }
+    }
+    break;
+  }
    case ISD::BSWAP:
      // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
      if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h

index e9a71073f91b07f6b3f2f964e028944de3da92cf..9842f45613d05f252632714129474970e2c95348 100644 (file)
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -254,6 +254,13 @@ namespace llvm {
        /// operand identifies the operating system entry point.
        SC,
  
+      /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little
+      /// endian.  Maps to an xxswapd instruction that corrects an lxvd2x
+      /// or stxvd2x instruction.  The chain is necessary because the
+      /// sequence replaces a load and needs to provide the same number
+      /// of outputs.
+      XXSWAPD,
+
        /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
        /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
        /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
@@ -293,7 +300,17 @@ namespace llvm {
        /// G8RC = ADDI_TOC_L G8RReg, Symbol - For medium code model, produces
        /// an ADDI8 instruction that adds G8RReg to sym\@toc\@l.
        /// Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
-      ADDI_TOC_L
+      ADDI_TOC_L,
+
+      /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
+      /// Maps directly to an lxvd2x instruction that will be followed by
+      /// an xxswapd.
+      LXVD2X,
+
+      /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
+      /// Maps directly to an stxvd2x instruction that will be preceded by
+      /// an xxswapd.
+      STXVD2X
      };
    }
  
@@ -403,6 +420,9 @@ namespace llvm {
      void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
                              SelectionDAG &DAG) const override;
  
+    SDValue expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const;
+
      SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
  
      unsigned getRegisterByName(const char* RegName, EVT VT) const override;
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td

index 6aff437a11d58c0431ea9d6992d5b89e4b729a18..d8003c9d8c89145c881a3ce853599d5e98449833 100644 (file)
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -25,6 +25,23 @@ def vsfrc : RegisterOperand<VSFRC> {
    let ParserMatchClass = PPCRegVSFRCAsmOperand;
  }
  
+// Little-endian-specific nodes.
+def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
+  SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCstxvd2x : SDTypeProfile<0, 2, [
+  SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCxxswapd : SDTypeProfile<1, 1, [
+  SDTCisSameAs<0, 1>
+]>;
+
+def PPClxvd2x  : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
+                        [SDNPHasChain, SDNPMayLoad]>;
+def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x,
+                        [SDNPHasChain, SDNPMayStore]>;
+def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>;
+
  multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL,
                      string asmbase, string asmstr, InstrItinClass itin,
                      list<dag> pattern> {
@@ -40,6 +57,9 @@ multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL,
  }
  
  def HasVSX : Predicate<"PPCSubTarget->hasVSX()">;
+def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">;
+def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">;
+
  let Predicates = [HasVSX] in {
  let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
  let hasSideEffects = 0 in { // VSX instructions don't have side effects.
@@ -854,11 +874,19 @@ def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
  def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
  def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
  def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
  
  // Stores.
  def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
  def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
  def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+
+// Permutes.
+def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
  
  // Selects.
  def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll

index 9eed623bacaa6ea1c55005a899d97bd3decb55ad..4edd8d59e5261fd260f788eca66873f8a45ae5c7 100644 (file)
--- a/test/CodeGen/PowerPC/ppc64le-aggregates.ll
+++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -1,4 +1,8 @@
-; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec -mattr=-vsx | FileCheck %s
+
+; Currently VSX support is disabled for this test because we generate lxsdx
+; instead of lfd, and stxsdx instead of stfd.  That is a poor choice when we
+; have reg+imm addressing, and is on the list of things to be fixed.
  
  target datalayout = "e-m:e-i64:64-n32:64"
  target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/ppcf128-endian.ll b/test/CodeGen/PowerPC/ppcf128-endian.ll

index 2a5f13a5c3da3c1a99c240e3a829321916e4ed43..180fedf5c9f4c2a7f83d6d1308a9120bf7b63fbc 100644 (file)
--- a/test/CodeGen/PowerPC/ppcf128-endian.ll
+++ b/test/CodeGen/PowerPC/ppcf128-endian.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr7 -mattr=+altivec < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mattr=+altivec -mattr=-vsx < %s | FileCheck %s
  
  target datalayout = "e-m:e-i64:64-n32:64"
  target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/vec_misaligned.ll b/test/CodeGen/PowerPC/vec_misaligned.ll

index 73a4a4d395daf4227871ae6af2d099af0ecdfa18..49f11e4e2604a1ec8a3865fa50e96979c4289bcf 100644 (file)
--- a/test/CodeGen/PowerPC/vec_misaligned.ll
+++ b/test/CodeGen/PowerPC/vec_misaligned.ll
@@ -1,6 +1,6 @@
  ; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
  ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mattr=-power8-vector | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mattr=-power8-vector | FileCheck %s -check-prefix=CHECK-LE
  
  target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
  target triple = "powerpc-apple-darwin8"
diff --git a/test/CodeGen/PowerPC/vec_shuffle_le.ll b/test/CodeGen/PowerPC/vec_shuffle_le.ll

index a4b2119f6ebc270ad0714c115697001a8e59f2c2..c7fc1c60c5eab3af93d9bdf28b8a141d689c00e7 100644 (file)
--- a/test/CodeGen/PowerPC/vec_shuffle_le.ll
+++ b/test/CodeGen/PowerPC/vec_shuffle_le.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mcpu=pwr7 | FileCheck %s
  
  define void @VPKUHUM_xy(<16 x i8>* %A, <16 x i8>* %B) {
  entry:
diff --git a/test/CodeGen/PowerPC/vsx-ldst.ll b/test/CodeGen/PowerPC/vsx-ldst.ll

index cf593b779b7bb2b8d8288371f4297fbeb1526187..b9d23d9c270f4c51661ec87002244cefe3036d1b 100644 (file)
--- a/test/CodeGen/PowerPC/vsx-ldst.ll
+++ b/test/CodeGen/PowerPC/vsx-ldst.ll
@@ -9,6 +9,14 @@
  ; RUN: grep stxvw4x < %t | count 3
  ; RUN: grep stxvd2x < %t | count 3
  
+;; Note: The LE test variant is disabled until LE support for VSX is enabled,
+;; as otherwise we fail to get the expected counts.
+
+; R;UN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
+; R;UN: grep lxvd2x < %t | count 6
+; R;UN: grep stxvd2x < %t | count 6
+; R;UN: grep xxpermdi < %t | count 12
+
  @vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
  @vui = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
  @vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
author	Bill Schmidt <wschmidt@linux.vnet.ibm.com>
	Tue, 9 Dec 2014 16:35:51 +0000 (16:35 +0000)
committer	Bill Schmidt <wschmidt@linux.vnet.ibm.com>
	Tue, 9 Dec 2014 16:35:51 +0000 (16:35 +0000)
lib/Target/PowerPC/PPCISelLowering.cpp		patch \| blob \| history
lib/Target/PowerPC/PPCISelLowering.h		patch \| blob \| history
lib/Target/PowerPC/PPCInstrVSX.td		patch \| blob \| history
test/CodeGen/PowerPC/ppc64le-aggregates.ll		patch \| blob \| history
test/CodeGen/PowerPC/ppcf128-endian.ll		patch \| blob \| history
test/CodeGen/PowerPC/vec_misaligned.ll		patch \| blob \| history
test/CodeGen/PowerPC/vec_shuffle_le.ll		patch \| blob \| history
test/CodeGen/PowerPC/vsx-ldst.ll		patch \| blob \| history