From 614061bfb4fea3c1233ecf2676282e063724ae93 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nadav.rotem@intel.com>
Date: Wed, 10 Aug 2011 19:30:14 +0000
Subject: [PATCH] When performing a truncating store, it is sometimes possible
 to rearrange the data in-register prior to saving to memory.  When we reorder
 the data in memory we prevent the need to save multiple scalars to memory,
 making a single regular store.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@137238 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp   | 81 +++++++++++++++++++++++++++-
 test/CodeGen/X86/opt-shuff-tstore.ll | 39 ++++++++++++++
 2 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/X86/opt-shuff-tstore.ll
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7e904e52525..d709bc09da9 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12574,14 +12574,91 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  EVT VT = St->getValue().getValueType();
+  EVT StVT = St->getMemoryVT();
+  DebugLoc dl = St->getDebugLoc();
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.
+  // First, pack all of the elements in one place. Next, store to memory
+  // in fewer chunks.
+  if (St->isTruncatingStore() && VT.isVector()) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+    // We are going to use the original vector elt for storing.
+    // accumulated smaller vector elements must be a multiple of bigger size.
+    if (0 != (NumElems * ToSz) % FromSz) return SDValue();
+    unsigned SizeRatio  = FromSz / ToSz;
+
+    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StVT.getScalarType(), NumElems*SizeRatio);
+
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                DAG.getUNDEF(WideVec.getValueType()),
+                                ShuffleVec.data());
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+        StoreType = Tp;
+    }
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Ptr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(i));
+      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+      Chains.push_back(Ch);
+    }
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+                               Chains.size());
+  }
+
+
   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   // the FP state in cases where an emms may be missing.
   // A preferable solution to the general problem is to figure out the right
   // places to insert EMMS.  This qualifies as a quick hack.
 
   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
-  StoreSDNode *St = cast<StoreSDNode>(N);
-  EVT VT = St->getValue().getValueType();
   if (VT.getSizeInBits() != 64)
     return SDValue();
 
diff --git a/test/CodeGen/X86/opt-shuff-tstore.ll b/test/CodeGen/X86/opt-shuff-tstore.ll
new file mode 100644
index 00000000000..4596068d22c
--- /dev/null
+++ b/test/CodeGen/X86/opt-shuff-tstore.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mcpu=corei7 < %s -o - -promote-elements -mattr=+sse2,+sse41 | FileCheck %s
+
+; CHECK: func_4_8
+; A single memory write
+; CHECK: movd
+; CHECK-NEXT: ret
+define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
+  %r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4>
+  store <4 x i8> %r, <4 x i8>* %p
+  ret void
+}
+
+; CHECK: func_4_16
+; CHECK: movq
+; CHECK-NEXT: ret
+define void @func_4_16(<4 x i16> %param, <4 x i16>* %p) {
+  %r = add <4 x i16> %param, <i16 1, i16 2, i16 3, i16 4>
+  store <4 x i16> %r, <4 x i16>* %p
+  ret void
+}
+
+; CHECK: func_8_8
+; CHECK: movq
+; CHECK-NEXT: ret
+define void @func_8_8(<8 x i8> %param, <8 x i8>* %p) {
+  %r = add <8 x i8> %param, <i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4>
+  store <8 x i8> %r, <8 x i8>* %p
+  ret void
+}
+
+; CHECK: func_2_32
+; CHECK: movq
+; CHECK-NEXT: ret
+define void @func_2_32(<2 x i32> %param, <2 x i32>* %p) {
+  %r = add <2 x i32> %param, <i32 1, i32 2>
+  store <2 x i32> %r, <2 x i32>* %p
+  ret void
+}
+
-- 
2.34.1