On Sandy Bridge, split unaligned 256-bit stores into two XMM-sized stores.

author Nadav Rotem <nrotem@apple.com>

Sat, 19 Jan 2013 08:38:41 +0000 (08:38 +0000)

committer Nadav Rotem <nrotem@apple.com>

Sat, 19 Jan 2013 08:38:41 +0000 (08:38 +0000)
author Nadav Rotem <nrotem@apple.com>
Sat, 19 Jan 2013 08:38:41 +0000 (08:38 +0000)
committer Nadav Rotem <nrotem@apple.com>
Sat, 19 Jan 2013 08:38:41 +0000 (08:38 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index b6b10e2dcab4f7882acc272612aae40775ac9cdc..ca8cd741e78fd4de0ec4ee9db94ceca0e29dde16 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16344,12 +16344,15 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
  
    ISD::LoadExtType Ext = Ld->getExtensionType();
    unsigned Alignment = Ld->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8;
  
    // On Sandybridge unaligned 256bit loads are inefficient.
    if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
-      !DCI.isBeforeLegalizeOps() && Alignment < 32 &&
-      Ext == ISD::NON_EXTLOAD) {
+      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
      unsigned NumElems = RegVT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
      SDValue Ptr = Ld->getBasePtr();
      SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
  
@@ -16363,7 +16366,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                                  Ld->getPointerInfo(), Ld->isVolatile(),
                                  Ld->isNonTemporal(), Ld->isInvariant(),
-                                Alignment);
+                                std::max(Alignment/2U, 1U));
      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                               Load1.getValue(1),
                               Load2.getValue(1));
@@ -16536,16 +16539,21 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    DebugLoc dl = St->getDebugLoc();
    SDValue StoredVal = St->getOperand(1);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned Alignment = St->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8;
  
    // If we are saving a concatenation of two XMM registers, perform two stores.
    // On Sandy Bridge, 256-bit memory operations are executed by two
    // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
    // memory  operation.
    if (VT.is256BitVector() && !Subtarget->hasInt256() &&
-      StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
-      StoredVal.getNumOperands() == 2) {
-    SDValue Value0 = StoredVal.getOperand(0);
-    SDValue Value1 = StoredVal.getOperand(1);
+      StVT == VT && !IsAligned) {
+    unsigned NumElems = VT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
+    SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
  
      SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
      SDValue Ptr0 = St->getBasePtr();
@@ -16553,10 +16561,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
  
      SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
                                  St->getPointerInfo(), St->isVolatile(),
-                                St->isNonTemporal(), St->getAlignment());
+                                St->isNonTemporal(), Alignment);
      SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                                  St->getPointerInfo(), St->isVolatile(),
-                                St->isNonTemporal(), St->getAlignment());
+                                St->isNonTemporal(),
+                                std::max(Alignment/2U, 1U));
      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
    }
  
diff --git a/test/CodeGen/X86/2012-01-11-split-cv.ll b/test/CodeGen/X86/2012-01-11-split-cv.ll

index 6b9007291901684ac8a462f20d6a4beab075ba5d..7e914984fe445431ae388720a1210a667c9c57b1 100644 (file)
--- a/test/CodeGen/X86/2012-01-11-split-cv.ll
+++ b/test/CodeGen/X86/2012-01-11-split-cv.ll
@@ -2,7 +2,7 @@
  
  ;CHECK: add18i16
  define void @add18i16(<18 x i16>* nocapture sret %ret, <18 x i16>* %bp) nounwind {
-;CHECK: vmovups
+;CHECK: vmovaps
    %b = load <18 x i16>* %bp, align 16
    %x = add <18 x i16> zeroinitializer, %b
    store <18 x i16> %x, <18 x i16>* %ret, align 16
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll

index 64825bac97190957714a98879ede22a61fe50891..52deadc792426e81406b730bf6a12a3853fb3e90 100644 (file)
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -42,7 +42,7 @@ define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwt
  
  ; Move the constants using a single vector store.
  ; CHECK: merge_const_store_vec
-; CHECK: vmovups  %ymm0, (%rsi)
+; CHECK: vmovups
  ; CHECK: ret
  define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
    %1 = icmp sgt i32 %count, 0
diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll

index c9fc66a8a7917d6a48d025d47ad96aecad01082f..77a7c4f945f105886a8afa4c1f86717d20ab7ee8 100644 (file)
--- a/test/CodeGen/X86/avx-load-store.ll
+++ b/test/CodeGen/X86/avx-load-store.ll
@@ -53,19 +53,24 @@ define void @storev16i16(<16 x i16> %a) nounwind {
    unreachable
  }
  
-; CHECK: vmovups  %ymm
+; CHECK: storev16i16_01
+; CHECK: vextractf128
+; CHECK: vmovaps  %xmm
  define void @storev16i16_01(<16 x i16> %a) nounwind {
    store <16 x i16> %a, <16 x i16>* undef, align 4
    unreachable
  }
  
+; CHECK: storev32i8
  ; CHECK: vmovaps  %ymm
  define void @storev32i8(<32 x i8> %a) nounwind {
    store <32 x i8> %a, <32 x i8>* undef, align 32
    unreachable
  }
  
-; CHECK: vmovups  %ymm
+; CHECK: storev32i8_01
+; CHECK: vextractf128
+; CHECK: vmovups  %xmm
  define void @storev32i8_01(<32 x i8> %a) nounwind {
    store <32 x i8> %a, <32 x i8>* undef, align 4
    unreachable
@@ -76,7 +81,7 @@ define void @storev32i8_01(<32 x i8> %a) nounwind {
  ; CHECK: _double_save
  ; CHECK-NOT: vinsertf128 $1
  ; CHECK-NOT: vinsertf128 $0
-; CHECK: vmovaps %xmm
+; CHECK: vmovups %xmm
  ; CHECK: vmovaps %xmm
  define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
  entry:
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll

index 5201575f120290efbf486eadfa08e1a8dc6ab5fc..adee9bbe247781bb96f8fd07126dee6b4d1dac48 100755 (executable)
--- a/test/CodeGen/X86/avx-sext.ll
+++ b/test/CodeGen/X86/avx-sext.ll
@@ -186,18 +186,6 @@ define void @sext_4(<4 x i16>* %inbuf, <4 x i64>* %outbuf) {
    ret void                                                               
  }
  
-; AVX: sext_5
-; AVX: vpmovsxbw
-; AVX: vpmovsxwd
-; AVX: vpmovsxwd
-; AVX: vpmovsxdq
-; AVX: ret
-define void @sext_5(<8 x i8>* %inbuf, <8 x i64>* %outbuf) {
-  %v0 = load <8 x i8>* %inbuf
-  %r = sext <8 x i8> %v0 to <8 x i64>                                         
-  store <8 x i64> %r, <8 x i64>* %outbuf                                         
-  ret void                                                               
-}
  ; AVX: sext_6
  ; AVX: vpmovsxbw
  ; AVX: vpmovsxwd
diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll

index 2ae65c97d97ac32f612a6b5e2307a0067609fb31..a973befdafe7d1c1cd7967b2793ce68f7322b306 100644 (file)
--- a/test/CodeGen/X86/fp-load-trunc.ll
+++ b/test/CodeGen/X86/fp-load-trunc.ll
@@ -49,8 +49,8 @@ define <8 x float> @test4(<8 x double>* %p) nounwind {
  ; CHECK: movlhps
  ; CHECK: ret
  ; AVX:   test4
-; AVX:   vcvtpd2psy {{[0-9]*}}(%{{.*}})
-; AVX:   vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX:   vcvtpd2psy
+; AVX:   vcvtpd2psy
  ; AVX:   vinsertf128
  ; AVX:   ret
    %x = load <8 x double>* %p
diff --git a/test/CodeGen/X86/sandybridge-loads.ll b/test/CodeGen/X86/sandybridge-loads.ll

index d85c32eaa7ec90419c9535ffc4f2e21faa83e12d..5a23cf136d852574a4fa40fe32800b400b6be3c7 100644 (file)
--- a/test/CodeGen/X86/sandybridge-loads.ll
+++ b/test/CodeGen/X86/sandybridge-loads.ll
@@ -3,7 +3,7 @@
  ;CHECK: wideloads
  ;CHECK: vmovaps
  ;CHECK: vinsertf128
-;CHECK: vmovups
+;CHECK: vmovaps
  ;CHECK-NOT: vinsertf128
  ;CHECK: ret
  
@@ -11,11 +11,29 @@ define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
    %v0 = load <8 x float>* %a, align 16  ; <---- unaligned!
    %v1 = load <8 x float>* %b, align 32  ; <---- aligned!
    %m0 = fcmp olt <8 x float> %v1, %v0
-  %v2 = load <8 x float>* %c, align 16
+  %v2 = load <8 x float>* %c, align 32  ; <---- aligned!
    %m1 = fcmp olt <8 x float> %v2, %v0
    %mand = and <8 x i1> %m1, %m0
    %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
+  ret void
+}
+
+; CHECK: widestores
+; loads:
+; CHECK: vmovaps
+; CHECK: vmovaps
+; stores:
+; CHECK: vmovaps
+; CHECK: vextractf128
+; CHECK: vmovaps
+;CHECK: ret
+
+define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+  %v0 = load <8 x float>* %a, align 32
+  %v1 = load <8 x float>* %b, align 32
+  store <8 x float> %v0, <8 x float>* %b, align 32 ; <--- aligned
+  store <8 x float> %v1, <8 x float>* %a, align 16 ; <--- unaligned
    ret void
  }
  
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll

index ea231aff5b66263517fc34d222e1742e815786e2..8cbfb5d7243af16063fb3ee76676c4182b28bd2a 100644 (file)
--- a/test/CodeGen/X86/v8i1-masks.ll
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -6,7 +6,7 @@
  ;CHECK: vcmpltp
  ;CHECK: vandps
  ;CHECK: vandps
-;CHECK: vmovups
+;CHECK: vmovaps
  ;CHECK: ret
  
  define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
@@ -17,7 +17,7 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
    %m1 = fcmp olt <8 x float> %v2, %v0
    %mand = and <8 x i1> %m1, %m0
    %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
    ret void
  }
  
@@ -25,7 +25,7 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
  ;CHECK: vcmpltps
  ;CHECK: vxorps
  ;CHECK: vandps
-;CHECK: vmovups
+;CHECK: vmovaps
  ;CHECK: ret
  define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
    %v0 = load <8 x float>* %a, align 16
@@ -33,7 +33,7 @@ define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
    %m0 = fcmp olt <8 x float> %v1, %v0
    %mand = xor <8 x i1> %m0, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
    %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
    ret void
  }
  
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll

index dc0464ff9e0f5fab688ce136cc8bc382a287d464..e4a8f46cbc79f9a44047d332bc0e21d3bfa381eb 100644 (file)
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -29,8 +29,8 @@ entry:
  ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
  ; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
  ; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
-; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
  ; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
    %0 = load <8 x float>* %in
    %1 = fpext <8 x float> %0 to <8 x double>
    store <8 x double> %1, <8 x double>* %out, align 1
author	Nadav Rotem <nrotem@apple.com>
	Sat, 19 Jan 2013 08:38:41 +0000 (08:38 +0000)
committer	Nadav Rotem <nrotem@apple.com>
	Sat, 19 Jan 2013 08:38:41 +0000 (08:38 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
test/CodeGen/X86/2012-01-11-split-cv.ll		patch | blob | history
test/CodeGen/X86/MergeConsecutiveStores.ll		patch | blob | history
test/CodeGen/X86/avx-load-store.ll		patch | blob | history
test/CodeGen/X86/avx-sext.ll		patch | blob | history
test/CodeGen/X86/fp-load-trunc.ll		patch | blob | history
test/CodeGen/X86/sandybridge-loads.ll		patch | blob | history
test/CodeGen/X86/v8i1-masks.ll		patch | blob | history
test/CodeGen/X86/vec_fpext.ll		patch | blob | history