%tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
+
+declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+; CHECK: vqdmull_4xi16
+define <4 x i32> @vqdmull_4xi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+; CHECK: vqdmull.s16 q8, d16, d17 @ encoding: [0xa1,0x0d,0xd0,0xf2]
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+; CHECK: vqdmull_2xi32
+define <2 x i64> @vqdmull_2xi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+; CHECK: vqdmull.s32 q8, d16, d17 @ encoding: [0xa1,0x0d,0xe0,0xf2]
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}