Test

[oota-llvm.git] / test / CodeGen / X86 / load-slice.ll
diff --git a/test/CodeGen/X86/load-slice.ll b/test/CodeGen/X86/load-slice.ll

index 8086def0e366c598dc1bf604c43126187f796e58..2f90f819d47e02c0c4df5d70d7fa99166313b29e 100644 (file)
--- a/test/CodeGen/X86/load-slice.ll
+++ b/test/CodeGen/X86/load-slice.ll
@@ -1,12 +1,12 @@
-; RUN: llc -mtriple x86_64-apple-macosx -mattr=+sse4.2 -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
-; RUN: llc -mtriple x86_64-apple-macosx -mattr=+sse4.2 < %s -o - | FileCheck %s --check-prefix=REGULAR
+; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
+; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
  ;
  ; <rdar://problem/14477220>
  
  %class.Complex = type { float, float }
  
  
-; Check that independant slices leads to independant loads then the slices leads to
+; Check that independent slices lead to independent loads when the slices lead to a
  ; different register file.
  ;
  ; The layout is:
@@ -17,51 +17,51 @@
  ; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned.
  ;
  ; STRESS-LABEL: t1:
-; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
-; STRESS: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]]
-; Add high slice: out[out_start].imm, this is base + 4.
-; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
  ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
-; STRESS-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]]
+; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]]
  ; Add low slice: out[out_start].real, this is base + 0.
  ; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
+; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
+; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
+; Add high slice: out[out_start].imm, this is base + 4.
+; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
  ; Swap Imm and Real.
  ; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
  ; Put the results back into out[out_start].
-; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]])
+; STRESS-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
  ;
  ; Same for REGULAR: we eliminate the register bank copy with each slice.
  ; REGULAR-LABEL: t1:
-; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
-; REGULAR: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]]
-; Add high slice: out[out_start].imm, this is base + 4.
-; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
  ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
-; REGULAR-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]]
+; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
  ; Add low slice: out[out_start].real, this is base + 0.
  ; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
+; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
+; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
+; Add high slice: out[out_start].imm, this is base + 4.
+; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
  ; Swap Imm and Real.
  ; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
  ; Put the results back into out[out_start].
-; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]])
+; REGULAR-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
  define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
  entry:
-  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
    %tmp = bitcast %class.Complex* %arrayidx to i64*
-  %tmp1 = load i64* %tmp, align 8
+  %tmp1 = load i64, i64* %tmp, align 8
    %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
    %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
    %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
    %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
    %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
    %add = add i64 %out_start, 8
-  %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
-  %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
-  %tmp4 = load float* %i.i, align 4
+  %arrayidx2 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add
+  %i.i = getelementptr inbounds %class.Complex, %class.Complex* %arrayidx2, i64 0, i32 0
+  %tmp4 = load float, float* %i.i, align 4
    %add.i = fadd float %tmp4, %tmp2
    %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
-  %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
-  %tmp5 = load float* %r.i, align 4
+  %r.i = getelementptr inbounds %class.Complex, %class.Complex* %arrayidx2, i64 0, i32 1
+  %tmp5 = load float, float* %r.i, align 4
    %add5.i = fadd float %tmp5, %tmp3
    %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
    %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
@@ -100,9 +100,9 @@ declare void @llvm.lifetime.end(i64, i8* nocapture)
  ; REGULAR-LABEL: t2:
  ; REGULAR: shrq $48
  define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
-  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
    %bitcast = bitcast %class.Complex* %arrayidx to i64*
-  %chunk64 = load i64* %bitcast, align 8
+  %chunk64 = load i64, i64* %bitcast, align 8
    %slice32_low = trunc i64 %chunk64 to i32
    %shift48 = lshr i64 %chunk64, 48
    %slice32_high = trunc i64 %shift48 to i32
@@ -125,9 +125,9 @@ define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
  ; REGULAR: shrq $48
  ; REGULAR: shrq $32
  define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
-  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
    %bitcast = bitcast %class.Complex* %arrayidx to i64*
-  %chunk64 = load i64* %bitcast, align 8
+  %chunk64 = load i64, i64* %bitcast, align 8
    %slice32_low = trunc i64 %chunk64 to i32
    %shift48 = lshr i64 %chunk64, 48
    %slice32_high = trunc i64 %shift48 to i32
@@ -137,4 +137,3 @@ define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
    %res = add i32 %slice32_lowhigh, %tmpres
    ret i32 %res
  }
-