def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
(VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))),
+ (VPBROADCASTDrZkrr VK16WM:$mask, GR32:$src)>;
def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
(VPBROADCASTQrZrr GR64:$src)>;
def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))),
(VMOVZPQILo2PQIZrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
(VMOVZPQILo2PQIZrr VR128X:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVZPQILo2PQIZrm addr:$src)>;
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl
+
+define <16 x i32> @test_vbroadcast() {
+entry:
+ %0 = sext <16 x i1> zeroinitializer to <16 x i32>
+ %1 = fcmp uno <16 x float> undef, zeroinitializer
+ %2 = sext <16 x i1> %1 to <16 x i32>
+ %3 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> %2
+ ret <16 x i32> %3
+}
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl
+
+define <8 x i16> @test_zext_load() {
+entry:
+ %0 = load <2 x i16> ** undef, align 8
+ %1 = getelementptr inbounds <2 x i16>* %0, i64 1
+ %2 = load <2 x i16>* %0, align 1
+ %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %4 = load <2 x i16>* %1, align 1
+ %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %6
+}