X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86InstrSSE.td;h=3874c1968b530b5c48b57c0be6ddfe2a6710f232;hb=c31aaa5a3fcef2851898fb30c61c16a70564079a;hp=ea8b6c7cf43731d6a98eb14577afa85eedaa2a58;hpb=15c8452584f38d29a73e98d7b123e408617e55c9;p=oota-llvm.git diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index ea8b6c7cf43..3874c1968b5 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -612,29 +612,6 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { // Patterns let Predicates = [UseAVX] in { - let AddedComplexity = 15 in { - // Move scalar to XMM zero-extended, zeroing a VR128 then do a - // MOVS{S,D} to the lower bits. - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; - def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; - def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; - - // Move low f32 and clear high bits. - def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4f32 (V_SET0)), - (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>; - def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4i32 (V_SET0)), - (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>; - } - let AddedComplexity = 20 in { // MOVSSrm zeros the high parts of the register; represent this // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 @@ -670,31 +647,10 @@ let Predicates = [UseAVX] in { (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; } - def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), - (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), - sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), - (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), - sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>; - // Move low f64 and clear high bits. - def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2f64 (V_SET0)), - (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>; - - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2i64 (V_SET0)), - (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>; - // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), addr:$dst), @@ -745,7 +701,6 @@ let Predicates = [UseAVX] in { (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)), sub_xmm)>; - // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold cause // it has two uses through a bitcast. One use disappears at isel time and the @@ -761,7 +716,7 @@ let Predicates = [UseAVX] in { } let Predicates = [UseSSE1] in { - let AddedComplexity = 15 in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSS to the lower bits. def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), @@ -795,7 +750,7 @@ let Predicates = [UseSSE1] in { } let Predicates = [UseSSE2] in { - let AddedComplexity = 15 in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSD to the lower bits. def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), @@ -865,6 +820,7 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in Sched<[WriteLoad]>; } +let Predicates = [HasAVX, NoVLX] in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, PS, VEX; @@ -890,20 +846,26 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, PD, VEX, VEX_L; +} + +let Predicates = [UseSSE1] in { defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, PS; -defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, - "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - PD; defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, PS; +} +let Predicates = [UseSSE2] in { +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD; defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, PD; +} -let SchedRW = [WriteStore] in { +let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)], @@ -1047,7 +1009,7 @@ let Predicates = [UseSSE2] in (MOVUPDmr addr:$dst, VR128:$src)>; // Use vmovaps/vmovups for AVX integer load/store. -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { // 128-bit load/store def : Pat<(alignedloadv2i64 addr:$src), (VMOVAPSrm addr:$src)>; @@ -1262,6 +1224,9 @@ let Predicates = [HasAVX] in { (VMOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), @@ -1309,6 +1274,9 @@ let Predicates = [UseSSE2] in { (MOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), @@ -1371,6 +1339,11 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (VMOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; } let Predicates = [UseSSE1] in { @@ -1391,6 +1364,11 @@ let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (MOVHPDrm VR128:$src1, addr:$src2)>; } //===----------------------------------------------------------------------===// @@ -2588,18 +2566,17 @@ def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), /// sse12_shuffle - sse 1 & 2 fp shuffle instructions multiclass sse12_shuffle { + Domain d> { def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, Sched<[WriteFShuffleLd, ReadAfterLd]>; - let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in - def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), asm, - [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, - Sched<[WriteFShuffle]>; + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, + Sched<[WriteFShuffle]>; } defm VSHUFPS : sse12_shuffle, PS; + memopv4f32, SSEPackedSingle>, PS; defm SHUFPD : sse12_shuffle, PD; + memopv2f64, SSEPackedDouble>, PD; } let Predicates = [HasAVX] in { @@ -3147,7 +3124,6 @@ let Predicates = [UseSSE1] in { let Predicates = [UseSSE2] in { // SSE2 patterns to select scalar double-precision fp arithmetic instructions - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), @@ -3167,10 +3143,10 @@ let Predicates = [UseSSE2] in { } let Predicates = [UseSSE41] in { - // If the subtarget has SSE4.1 but not AVX, the vector insert - // instruction is lowered into a X86insertps rather than a X86Movss. - // When selecting SSE scalar single-precision fp arithmetic instructions, - // make sure that we correctly match the X86insertps. + // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is + // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When + // selecting SSE scalar single-precision fp arithmetic instructions, make + // sure that we correctly match them. def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), @@ -3188,6 +3164,57 @@ let Predicates = [UseSSE41] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } let Predicates = [HasAVX] in { @@ -3226,6 +3253,57 @@ let Predicates = [HasAVX] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } // Patterns used to select SSE scalar fp arithmetic instructions from @@ -3280,6 +3358,49 @@ let Predicates = [UseSSE2] in { (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; } +let Predicates = [UseSSE41] in { + // With SSE4.1 we may see these operations using X86Blendi rather than + // X86Movs{s,d}. + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (MULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; +} + let Predicates = [HasAVX] in { // The following patterns select AVX Scalar single/double precision fp // arithmetic instructions from a packed single precision fp instruction @@ -3309,6 +3430,46 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + // Also handle X86Blendi-based patterns. + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; } /// Unop Arithmetic @@ -3337,6 +3498,16 @@ def SSE_SQRTSD : OpndItins< >; } +let Sched = WriteFRsqrt in { +def SSE_RSQRTPS : OpndItins< + IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM +>; + +def SSE_RSQRTSS : OpndItins< + IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM +>; +} + let Sched = WriteFRcp in { def SSE_RCPP : OpndItins< IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM @@ -3615,10 +3786,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>, +defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>, sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, - int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>; + int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>; defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, @@ -3697,6 +3868,7 @@ let Predicates = [UseSSE1] in { let AddedComplexity = 400 in { // Prefer non-temporal versions let SchedRW = [WriteStore] in { +let Predicates = [HasAVX, NoVLX] in { def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -3737,6 +3909,7 @@ def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), [(alignednontemporalstore (v4i64 VR256:$src), addr:$dst)], IIC_SSE_MOVNT>, VEX, VEX_L; +} def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -3766,6 +3939,14 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), PS, Requires<[HasSSE2]>; } // SchedRW = [WriteStore] +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (VMOVNTPSmr addr:$dst, VR128:$src)>; +} + +def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (MOVNTPSmr addr:$dst, VR128:$src)>; + } // AddedComplexity //===----------------------------------------------------------------------===// @@ -5288,6 +5469,13 @@ let Predicates = [HasAVX] in { (VMOVDDUPYrr VR256:$src)>; } +let Predicates = [UseAVX, OptForSize] in { + def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VMOVDDUPrm addr:$src)>; +} + let Predicates = [UseSSE3] in { def : Pat<(X86Movddup (memopv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; @@ -5368,56 +5556,34 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { // Patterns used to select 'addsub' instructions. let Predicates = [HasAVX] in { - // Constant 170 corresponds to the binary mask '10101010'. - // When used as a blend mask, it allows selecting eight elements from two - // input vectors as follow: - // - Even-numbered values in the destination are copied from - // the corresponding elements in the first input vector; - // - Odd-numbered values in the destination are copied from - // the corresponding elements in the second input vector. - - def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)), - (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))), - (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; - - // Constant 10 corresponds to the binary mask '1010'. - // In the two pattens below, constant 10 is used as a blend mask to select - // - the 1st and 3rd element from the first input vector (the 'fsub' node); - // - the 2nd and 4th element from the second input vector (the 'fadd' node). - - def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), - (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), - (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), - (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), - (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), - (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), - (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), - (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), - (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; + + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 (memop addr:$rhs)))), + (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 (memop addr:$rhs)))), + (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; } let Predicates = [UseSSE3] in { - // Constant 10 corresponds to the binary mask '1010'. - // In the pattern below, it is used as a blend mask to select: - // - the 1st and 3rd element from the first input vector (the fsub node); - // - the 2nd and 4th element from the second input vector (the fadd node). - - def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), - (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - - def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), - (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), - (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), - (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; } //===---------------------------------------------------------------------===// @@ -6703,7 +6869,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insertf32 opc, string asm, bit Is2Addr = 1, OpndItins itins = DEFAULT_ITINS> { def rr : SS4AIi8 opc, string asm, bit Is2Addr = 1, (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, Sched<[WriteFShuffle]>; def rm : SS4AIi8 opc, string OpcodeStr, OpndItins itins = DEFAULT_ITINS> { let isCommutable = 1 in def rri : SS4AIi8 opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, Sched<[itins.Sched]>; def rmi : SS4AIi8; } +// Patterns +let Predicates = [UseAVX] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VBLENDPSrri (v4i32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>; + } + + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), + sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), + sub_xmm)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; + + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>; +} + +let Predicates = [UseSSE41] in { + // With SSE41 we can use blends for these patterns. + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>; +} + + /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { multiclass SS41I_ternary_int opc, string OpcodeStr, PatFrag mem_frag, @@ -8407,13 +8624,13 @@ multiclass avx_permil opc_rm, bits<8> opc_rmi, string OpcodeStr, def ri : AVXAIi8, VEX, + [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffle]>; def mi : AVXAIi8, VEX, + (vt (X86VPermilpi (memop addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; } @@ -8431,19 +8648,37 @@ let ExeDomain = SSEPackedDouble in { } let Predicates = [HasAVX] in { -def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), + (VPERMILPSYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPERMILPSYrm VR256:$src1, addr:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))), + (VPERMILPDYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))), + (VPERMILPDYrm VR256:$src1, addr:$src2)>; + +def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, imm:$imm)>; -def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)), +def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)), (i8 imm:$imm))), (VPERMILPSYmi addr:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDYmi addr:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))), +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))), + (VPERMILPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPERMILPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))), + (VPERMILPDrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))), + (VPERMILPDrm VR128:$src1, addr:$src2)>; + +def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))), (VPERMILPDri VR128:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDmi addr:$src1, imm:$imm)>; } @@ -8577,13 +8812,13 @@ multiclass AVX2_binop_rmi_int opc, string OpcodeStr, X86MemOperand x86memop> { let isCommutable = 1 in def rri : AVX2AIi8, Sched<[WriteBlend]>, VEX_4V; def rmi : AVX2AIi8; + // Provide aliases for broadcast from the same regitser class that + // automatically does the extract. + def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), + (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), + sub_xmm)))>; + def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))), + (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), + sub_xmm)))>; + def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))), + (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))), + (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), + sub_xmm)))>; + def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), + (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), + (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), + sub_xmm)))>; + // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. let AddedComplexity = 20 in { @@ -8770,6 +9026,9 @@ let Predicates = [HasAVX] in { (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm), (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; } + + def : Pat<(v2f64 (X86VBroadcast f64:$src)), + (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; } //===----------------------------------------------------------------------===//