// Misc.
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse2_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+ Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty,
llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_sse2_packssdw_128 : GCCBuiltin<"__builtin_ia32_packssdw128">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty,
llvm_v4i32_ty], [IntrNoMem]>;
def int_x86_sse2_packuswb_128 : GCCBuiltin<"__builtin_ia32_packuswb128">,
- Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+ Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty,
llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_sse2_movmsk_pd : GCCBuiltin<"__builtin_ia32_movmskpd">,
Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8)
-declare <4 x i32> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>)
%tmp337 = bitcast <4 x i32> %tmp336 to <4 x float> ; <<4 x float>> [#uses=1]
%tmp378 = tail call <4 x float> @llvm.x86.sse.cmp.ps( <4 x float> %tmp337, <4 x float> zeroinitializer, i8 1 ) ; <<4 x float>> [#uses=1]
%tmp379 = bitcast <4 x float> %tmp378 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp388 = tail call <4 x i32> @llvm.x86.sse2.packssdw.128( <4 x i32> zeroinitializer, <4 x i32> %tmp379 ) ; <<4 x i32>> [#uses=1]
- %tmp392 = bitcast <4 x i32> %tmp388 to <8 x i16> ; <<8 x i16>> [#uses=1]
+ %tmp388 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128( <4 x i32> zeroinitializer, <4 x i32> %tmp379 ) ; <<4 x i32>> [#uses=1]
+ %tmp392 = bitcast <8 x i16> %tmp388 to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp399 = extractelement <8 x i16> %tmp392, i32 7 ; <i16> [#uses=1]
%tmp423 = insertelement <8 x i16> zeroinitializer, i16 %tmp399, i32 7 ; <<8 x i16>> [#uses=1]
%tmp427 = bitcast <8 x i16> %tmp423 to <16 x i8> ; <<16 x i8>> [#uses=1]
%tmp75 = bitcast <4 x float> %tmp74 to <4 x i32> ; <<4 x i32>> [#uses=1]
%tmp88 = tail call <4 x float> @llvm.x86.sse.cmp.ps( <4 x float> %tmp44, <4 x float> %tmp61, i8 1 ) ; <<4 x float>> [#uses=1]
%tmp89 = bitcast <4 x float> %tmp88 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp98 = tail call <4 x i32> @llvm.x86.sse2.packssdw.128( <4 x i32> %tmp75, <4 x i32> %tmp89 ) ; <<4 x i32>> [#uses=1]
- %tmp102 = bitcast <4 x i32> %tmp98 to <8 x i16> ; <<8 x i16>> [#uses=1]
+ %tmp98 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128( <4 x i32> %tmp75, <4 x i32> %tmp89 ) ; <<4 x i32>> [#uses=1]
+ %tmp102 = bitcast <8 x i16> %tmp98 to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp.upgrd.1 = shufflevector <8 x i16> %tmp102, <8 x i16> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 4, i32 7 > ; <<8 x i16>> [#uses=1]
%tmp105 = shufflevector <8 x i16> %tmp.upgrd.1, <8 x i16> undef, <8 x i32> < i32 2, i32 1, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7 > ; <<8 x i16>> [#uses=1]
%tmp105.upgrd.2 = bitcast <8 x i16> %tmp105 to <4 x float> ; <<4 x float>> [#uses=1]
%tmp134 = bitcast <4 x float> %tmp133 to <4 x i32> ; <<4 x i32>> [#uses=1]
%tmp147 = tail call <4 x float> @llvm.x86.sse.cmp.ps( <4 x float> %tmp44, <4 x float> %tmp120, i8 1 ) ; <<4 x float>> [#uses=1]
%tmp148 = bitcast <4 x float> %tmp147 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp159 = tail call <4 x i32> @llvm.x86.sse2.packssdw.128( <4 x i32> %tmp134, <4 x i32> %tmp148 ) ; <<4 x i32>> [#uses=1]
- %tmp163 = bitcast <4 x i32> %tmp159 to <8 x i16> ; <<8 x i16>> [#uses=1]
+ %tmp159 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128( <4 x i32> %tmp134, <4 x i32> %tmp148 ) ; <<4 x i32>> [#uses=1]
+ %tmp163 = bitcast <8 x i16> %tmp159 to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp164 = shufflevector <8 x i16> %tmp163, <8 x i16> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 4, i32 7 > ; <<8 x i16>> [#uses=1]
%tmp166 = shufflevector <8 x i16> %tmp164, <8 x i16> undef, <8 x i32> < i32 2, i32 1, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7 > ; <<8 x i16>> [#uses=1]
%tmp166.upgrd.4 = bitcast <8 x i16> %tmp166 to <4 x float> ; <<4 x float>> [#uses=1]
%tmp195 = bitcast <4 x float> %tmp194 to <4 x i32> ; <<4 x i32>> [#uses=1]
%tmp208 = tail call <4 x float> @llvm.x86.sse.cmp.ps( <4 x float> %tmp44, <4 x float> %tmp181, i8 1 ) ; <<4 x float>> [#uses=1]
%tmp209 = bitcast <4 x float> %tmp208 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp220 = tail call <4 x i32> @llvm.x86.sse2.packssdw.128( <4 x i32> %tmp195, <4 x i32> %tmp209 ) ; <<4 x i32>> [#uses=1]
- %tmp224 = bitcast <4 x i32> %tmp220 to <8 x i16> ; <<8 x i16>> [#uses=1]
+ %tmp220 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128( <4 x i32> %tmp195, <4 x i32> %tmp209 ) ; <<4 x i32>> [#uses=1]
+ %tmp224 = bitcast <8 x i16> %tmp220 to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp225 = shufflevector <8 x i16> %tmp224, <8 x i16> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 4, i32 7 > ; <<8 x i16>> [#uses=1]
%tmp227 = shufflevector <8 x i16> %tmp225, <8 x i16> undef, <8 x i32> < i32 2, i32 1, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7 > ; <<8 x i16>> [#uses=1]
%tmp227.upgrd.6 = bitcast <8 x i16> %tmp227 to <4 x float> ; <<4 x float>> [#uses=1]
%tmp256 = bitcast <4 x float> %tmp255 to <4 x i32> ; <<4 x i32>> [#uses=1]
%tmp269 = tail call <4 x float> @llvm.x86.sse.cmp.ps( <4 x float> %tmp44, <4 x float> %tmp242, i8 1 ) ; <<4 x float>> [#uses=1]
%tmp270 = bitcast <4 x float> %tmp269 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp281 = tail call <4 x i32> @llvm.x86.sse2.packssdw.128( <4 x i32> %tmp256, <4 x i32> %tmp270 ) ; <<4 x i32>> [#uses=1]
- %tmp285 = bitcast <4 x i32> %tmp281 to <8 x i16> ; <<8 x i16>> [#uses=1]
+ %tmp281 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128( <4 x i32> %tmp256, <4 x i32> %tmp270 ) ; <<4 x i32>> [#uses=1]
+ %tmp285 = bitcast <8 x i16> %tmp281 to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp286 = shufflevector <8 x i16> %tmp285, <8 x i16> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 4, i32 7 > ; <<8 x i16>> [#uses=1]
%tmp288 = shufflevector <8 x i16> %tmp286, <8 x i16> undef, <8 x i32> < i32 2, i32 1, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7 > ; <<8 x i16>> [#uses=1]
%tmp288.upgrd.8 = bitcast <8 x i16> %tmp288 to <4 x float> ; <<4 x float>> [#uses=1]
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8)
-declare <4 x i32> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
; RUN: llc < %s -march=x86 -mattr=+sse2
; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep punpckhwd
-declare <8 x i16> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>)
%tmp805 = add <4 x i32> %tmp777, zeroinitializer
%tmp832 = bitcast <4 x i32> %tmp805 to <8 x i16>
%tmp838 = tail call <8 x i16> @llvm.x86.sse2.psrl.w( <8 x i16> %tmp832, <8 x i16> < i16 8, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef > )
- %tmp1020 = tail call <8 x i16> @llvm.x86.sse2.packuswb.128( <8 x i16> zeroinitializer, <8 x i16> %tmp838 )
- %tmp1030 = bitcast <8 x i16> %tmp1020 to <4 x i32>
+ %tmp1020 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128( <8 x i16> zeroinitializer, <8 x i16> %tmp838 )
+ %tmp1030 = bitcast <16 x i8> %tmp1020 to <4 x i32>
%tmp1033 = add <4 x i32> zeroinitializer, %tmp1030
%tmp1048 = bitcast <4 x i32> %tmp1033 to <2 x i64>
%tmp1049 = or <2 x i64> %tmp1048, zeroinitializer