; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; FIXME: use avx versions for punpcklbw, punpckhbw and punpckhwd
-; CHECK: vextractf128 $0
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: punpckhbw
+; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $85
; funcA: splat a single byte lane across a 256-bit <32 x i8> vector.
; Every one of the 32 shuffle-mask indices is 5, so each result lane is a copy
; of element 5 of %a; the second (undef) operand is never selected.
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <32 x i8> %shuffle
}
-; CHECK: vextractf128 $0
-; CHECK-NEXT: punpckhwd
+; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11]
; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $85
; funcB: splat a single 16-bit lane across a 256-bit <16 x i16> vector.
; All 16 shuffle-mask indices are 5, so each result lane is a copy of element 5
; of %a (bytes 10 and 11 of the source); the undef operand is never selected.
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <16 x i16> %shuffle
}
-; CHECK: vmovd
+; CHECK: vmovq
+; CHECK-NEXT: vmovddup %xmm
; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $0
; funcC: build a <4 x i64> vector from the scalar i64 argument %q.
; NOTE(review): only the insertion into lane 0 is visible here; the returned
; value %vecinit6.i is not defined by any line in this view. Presumably the
; intermediate insertelement steps for the remaining lanes are elided from this
; excerpt -- confirm against the full test file.
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
entry:
; Place %q in lane 0 of an otherwise-undef vector.
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
ret <4 x i64> %vecinit6.i
}
-; CHECK: vinsertf128 $1
-; CHECK-NEXT: vpermilps $0
+; CHECK: vmovddup %xmm
+; CHECK-NEXT: vinsertf128 $1
; funcD: build a <4 x double> vector from the scalar double argument %q.
; NOTE(review): only the insertion into lane 0 is visible here; the returned
; value %vecinit6.i is not defined by any line in this view. Presumably the
; intermediate insertelement steps for the remaining lanes are elided from this
; excerpt -- confirm against the full test file.
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
entry:
; Place %q in lane 0 of an otherwise-undef vector.
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
ret <4 x double> %vecinit6.i
}
-; Test this simple opt:
+; Test that this pattern turns into a broadcast:
; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
-; To:
-; shuffle (vload ptr)), undef, <1, 1, 1, 1>
-; CHECK: vmovaps
-; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $-1
+;
+; CHECK: vbroadcastss
; funcE: load one 32-bit value from a stack array and insert it into lanes of
; an <8 x i32> vector, which is then reinterpreted as <8 x float>.
; NOTE(review): the branch target %__load_and_broadcast_32.exit1249 and the
; returned value %load_broadcast12281250 are not defined by any line in this
; view; presumably the exit block and the remaining lane insertions are elided
; from this excerpt -- confirm against the full test file.
define <8 x float> @funcE() nounwind {
allocas:
; 18x18 float array on the stack, 32-byte aligned.
%udx495 = alloca [18 x [18 x float]], align 32
; Condition is undef, so either successor may be taken.
br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247
load.i1247: ; preds = %for_exit499
; Address of element [1][1] of the array. The '-'/'+' pair differs only in that
; the '+' form spells out the source element type as an explicit first operand.
- %ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
+ %ptr1227 = getelementptr [18 x [18 x float]], [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
; Reinterpret the float pointer as i32* so the value is loaded as an integer.
%ptr.i1237 = bitcast float* %ptr1227 to i32*
; Same load in both forms; the '+' form names the result type explicitly.
- %val.i1238 = load i32* %ptr.i1237, align 4
+ %val.i1238 = load i32, i32* %ptr.i1237, align 4
; Insert the loaded scalar into lanes 6 and 7.
%ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6
%ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7
; Bit-preserving reinterpretation of the integer vector as floats.
%phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>
ret <8 x float> %load_broadcast12281250
}
-; CHECK: vinsertf128 $1
-; CHECK-NEXT: vpermilps $0
+; CHECK: vpermilps $4
+; CHECK-NEXT: vinsertf128 $1
; funcF: insert the scalar i32 argument %val into lanes of an <8 x i32> vector
; and return an <8 x float>.
; NOTE(review): the returned value %tmp is not defined by any line in this
; view; presumably further insertions and a bitcast to <8 x float> are elided
; from this excerpt -- confirm against the full test file.
define <8 x float> @funcF(i32 %val) nounwind {
; Insert %val into lanes 6 and 7 of an otherwise-undef vector.
%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
ret <8 x float> %tmp
}
-; CHECK: vinsertf128 $1
-; CHECK-NEXT: vpermilps $0
+; CHECK: vpermilps $0
+; CHECK-NEXT: vinsertf128 $1
; funcG: splat lane 0 of an <8 x float> vector across all eight lanes.
; NOTE(review): no ret instruction is visible between the shuffle and the
; closing brace in this view; presumably it is elided from this excerpt --
; confirm against the full test file.
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
; All eight mask indices are 0, so every result lane copies element 0 of %a.
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
}
; CHECK: vextractf128 $1
+; CHECK-NEXT: vpermilps $85
; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $85
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>