On Haswell, prefer storing YMM registers using a single instruction.

author Nadav Rotem <nadav.rotem@intel.com>

Sat, 19 May 2012 20:30:08 +0000 (20:30 +0000)

committer Nadav Rotem <nadav.rotem@intel.com>

Sat, 19 May 2012 20:30:08 +0000 (20:30 +0000)
author Nadav Rotem <nadav.rotem@intel.com>
Sat, 19 May 2012 20:30:08 +0000 (20:30 +0000)
committer Nadav Rotem <nadav.rotem@intel.com>
Sat, 19 May 2012 20:30:08 +0000 (20:30 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 203c8733aae6b1a5f75a69111f2ed0395be599ef..2810f4200da382e3635700de3e31c3d187f09000 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -14532,13 +14532,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
    // If we are saving a concatenation of two XMM registers, perform two stores.
-  // This is better in Sandy Bridge cause one 256-bit mem op is done via two
-  // 128-bit ones. If in the future the cost becomes only one memory access the
-  // first version would be better.
-  if (VT.getSizeInBits() == 256 &&
+  // On Sandy Bridge, 256-bit memory operations are executed by two
+  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
+  // memory  operation.
+  if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2() &&
        StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
        StoredVal.getNumOperands() == 2) {
-
      SDValue Value0 = StoredVal.getOperand(0);
      SDValue Value1 = StoredVal.getOperand(1);
  
diff --git a/test/CodeGen/X86/2012-05-19-avx2-store.ll b/test/CodeGen/X86/2012-05-19-avx2-store.ll

new file mode 100644 (file)

index 0000000..61fef90
--- /dev/null
+++ b/test/CodeGen/X86/2012-05-19-avx2-store.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
+
+define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
+entry:
+  ; CHECK: vmovaps
+  ; CHECK: vmovaps
+  ; CHECK: vinsertf128
+  ; CHECK: vmovups
+  %A = load <4 x i32>* %Ap
+  %B = load <4 x i32>* %Bp
+  %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i32> %Z, <8 x i32>* %P, align 16
+  ret void
+}
author	Nadav Rotem <nadav.rotem@intel.com>
	Sat, 19 May 2012 20:30:08 +0000 (20:30 +0000)
committer	Nadav Rotem <nadav.rotem@intel.com>
	Sat, 19 May 2012 20:30:08 +0000 (20:30 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/2012-05-19-avx2-store.ll	[new file with mode: 0644]	patch \| blob