; CHECK: vsubpd (%
define <4 x double> @subpd256fold(<4 x double> %y, <4 x double>* nocapture %x) nounwind uwtable readonly ssp {
entry:
- %tmp2 = load <4 x double>* %x, align 32
+ %tmp2 = load <4 x double>, <4 x double>* %x, align 32
%sub.i = fsub <4 x double> %y, %tmp2
ret <4 x double> %sub.i
}
; CHECK: vsubps (%
define <8 x float> @subps256fold(<8 x float> %y, <8 x float>* nocapture %x) nounwind uwtable readonly ssp {
entry:
- %tmp2 = load <8 x float>* %x, align 32
+ %tmp2 = load <8 x float>, <8 x float>* %x, align 32
%sub.i = fsub <8 x float> %y, %tmp2
ret <8 x float> %sub.i
}
}
declare float @sqrtf(float) readnone
+
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <4 x i64> @vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+ %x = add <4 x i64> %i, %j
+ ret <4 x i64> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpaddd %xmm
+; CHECK-NEXT: vpaddd %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <8 x i32> @vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+ %x = add <8 x i32> %i, %j
+ ret <8 x i32> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpaddw %xmm
+; CHECK-NEXT: vpaddw %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <16 x i16> @vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+ %x = add <16 x i16> %i, %j
+ ret <16 x i16> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpaddb %xmm
+; CHECK-NEXT: vpaddb %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <32 x i8> @vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+ %x = add <32 x i8> %i, %j
+ ret <32 x i8> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpsubq %xmm
+; CHECK-NEXT: vpsubq %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <4 x i64> @vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+ %x = sub <4 x i64> %i, %j
+ ret <4 x i64> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpsubd %xmm
+; CHECK-NEXT: vpsubd %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <8 x i32> @vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+ %x = sub <8 x i32> %i, %j
+ ret <8 x i32> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpsubw %xmm
+; CHECK-NEXT: vpsubw %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <16 x i16> @vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+ %x = sub <16 x i16> %i, %j
+ ret <16 x i16> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpsubb %xmm
+; CHECK-NEXT: vpsubb %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <32 x i8> @vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+ %x = sub <32 x i8> %i, %j
+ ret <32 x i8> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpmulld %xmm
+; CHECK-NEXT: vpmulld %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <8 x i32> @vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+ %x = mul <8 x i32> %i, %j
+ ret <8 x i32> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpmullw %xmm
+; CHECK-NEXT: vpmullw %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+ %x = mul <16 x i16> %i, %j
+ ret <16 x i16> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsrlq $32, %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsllq $32, %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpsrlq $32, %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsllq $32, %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsrlq $32, %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsllq $32, %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpsrlq $32, %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsllq $32, %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+ %x = mul <4 x i64> %i, %j
+ ret <4 x i64> %x
+}
+
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define <4 x float> @int_sqrt_ss() {
+; CHECK: int_sqrt_ss
+; CHECK: vsqrtss
+ %x0 = load float, float addrspace(1)* undef, align 8
+ %x1 = insertelement <4 x float> undef, float %x0, i32 0
+ %x2 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %x1) nounwind
+ ret <4 x float> %x2
+}