X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86InstrSSE.td;h=3874c1968b530b5c48b57c0be6ddfe2a6710f232;hb=c31aaa5a3fcef2851898fb30c61c16a70564079a;hp=043b2f32c6ffb3097ecf0ebb158c492fe7f80207;hpb=7cd893a9859dcba1045aef2ca66de95f0561344b;p=oota-llvm.git diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 043b2f32c6f..3874c1968b5 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -181,6 +181,7 @@ def SSE_MPSADBW_ITINS : OpndItins< IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM >; +let Sched = WriteVecIMul in def SSE_PMULLD_ITINS : OpndItins< IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM >; @@ -218,11 +219,21 @@ def DEFAULT_ITINS_BLENDSCHED : OpndItins< IIC_ALU_NONMEM, IIC_ALU_MEM >; +let Sched = WriteVarBlend in +def DEFAULT_ITINS_VARBLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + let Sched = WriteFBlend in def SSE_INTALU_ITINS_FBLEND_P : OpndItins< IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM >; +let Sched = WriteBlend in +def SSE_INTALU_ITINS_BLEND_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 Instructions Classes //===----------------------------------------------------------------------===// @@ -601,29 +612,6 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { // Patterns let Predicates = [UseAVX] in { - let AddedComplexity = 15 in { - // Move scalar to XMM zero-extended, zeroing a VR128 then do a - // MOVS{S,D} to the lower bits. - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; - def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; - def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; - - // Move low f32 and clear high bits. - def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4f32 (V_SET0)), - (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>; - def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4i32 (V_SET0)), - (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>; - } - let AddedComplexity = 20 in { // MOVSSrm zeros the high parts of the register; represent this // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 @@ -659,31 +647,10 @@ let Predicates = [UseAVX] in { (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; } - def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), - (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), - sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), - (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), - sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>; - // Move low f64 and clear high bits. 
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2f64 (V_SET0)), - (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>; - - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2i64 (V_SET0)), - (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>; - // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), addr:$dst), @@ -734,7 +701,6 @@ let Predicates = [UseAVX] in { (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)), sub_xmm)>; - // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold cause // it has two uses through a bitcast. One use disappears at isel time and the @@ -750,7 +716,7 @@ let Predicates = [UseAVX] in { } let Predicates = [UseSSE1] in { - let AddedComplexity = 15 in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSS to the lower bits. def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), @@ -784,7 +750,7 @@ let Predicates = [UseSSE1] in { } let Predicates = [UseSSE2] in { - let AddedComplexity = 15 in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSD to the lower bits. def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), @@ -854,6 +820,7 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in Sched<[WriteLoad]>; } +let Predicates = [HasAVX, NoVLX] in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, PS, VEX; @@ -879,20 +846,26 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, PD, VEX, VEX_L; +} + +let Predicates = [UseSSE1] in { defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, PS; -defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, - "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - PD; defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, PS; +} +let Predicates = [UseSSE2] in { +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD; defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, PD; +} -let SchedRW = [WriteStore] in { +let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)], @@ -1006,7 +979,7 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, - SchedRW = [WriteMove] in { + SchedRW = [WriteFShuffle] in { def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -1036,7 +1009,7 @@ let Predicates = [UseSSE2] in (MOVUPDmr addr:$dst, VR128:$src)>; // Use vmovaps/vmovups for AVX integer load/store. 
-let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { // 128-bit load/store def : Pat<(alignedloadv2i64 addr:$src), (VMOVAPSrm addr:$src)>; @@ -1251,6 +1224,9 @@ let Predicates = [HasAVX] in { (VMOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), @@ -1298,6 +1274,9 @@ let Predicates = [UseSSE2] in { (MOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), @@ -1360,6 +1339,11 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (VMOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; } let Predicates = [UseSSE1] in { @@ -1380,6 +1364,11 @@ let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (MOVHPDrm VR128:$src1, addr:$src2)>; } //===----------------------------------------------------------------------===// @@ -2577,18 +2566,17 @@ def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), /// sse12_shuffle - sse 1 & 2 fp shuffle instructions multiclass sse12_shuffle { + Domain d> { def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, Sched<[WriteFShuffleLd, ReadAfterLd]>; - let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in - def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), asm, - [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, - Sched<[WriteFShuffle]>; + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, + Sched<[WriteFShuffle]>; } defm VSHUFPS : sse12_shuffle, PS; + memopv4f32, SSEPackedSingle>, PS; defm SHUFPD : sse12_shuffle, PD; + memopv2f64, SSEPackedDouble>, PD; } let Predicates = [HasAVX] in { @@ -3136,7 +3124,6 @@ let Predicates = [UseSSE1] in { let Predicates = [UseSSE2] in { // SSE2 patterns to select scalar double-precision fp arithmetic instructions - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), @@ -3156,10 +3143,10 @@ let Predicates = [UseSSE2] in { } let Predicates = [UseSSE41] in { - // If the subtarget has SSE4.1 but not AVX, the vector insert - // instruction is lowered into a 
X86insertps rather than a X86Movss. - // When selecting SSE scalar single-precision fp arithmetic instructions, - // make sure that we correctly match the X86insertps. + // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is + // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When + // selecting SSE scalar single-precision fp arithmetic instructions, make + // sure that we correctly match them. def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), @@ -3177,6 +3164,57 @@ let Predicates = [UseSSE41] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } let Predicates = [HasAVX] in { 
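
(Aside, not part of the patch: the *_Int patterns added above all match the same DAG shape, a packed FP operation whose result is blended back into the low lane of the original destination. The C++ sketch below, written with the standard <immintrin.h> SSE4.1 intrinsics and compiled with -msse4.1, shows that shape for the v2f64/ADDSD case; the helper name is made up for illustration.)

#include <immintrin.h>
#include <cstdio>

// Semantically the same as _mm_add_sd(dst, src): add all lanes, then keep
// lane 0 of the sum and lane 1 of the original destination. This packed
// add + blend pair is the shape that the (X86Blendi ..., (i8 1)) ->
// ADDSDrr_Int patterns collapse into a single scalar addsd.
static __m128d add_low_lane(__m128d dst, __m128d src) {
  __m128d sum = _mm_add_pd(dst, src);      // full-vector fadd
  return _mm_blend_pd(dst, sum, 0x1);      // blend mask 1: take lane 0 from sum
}

int main() {
  __m128d a = _mm_set_pd(10.0, 1.0);       // lanes: [1.0, 10.0]
  __m128d b = _mm_set_pd(20.0, 2.0);       // lanes: [2.0, 20.0]
  double out[2];
  _mm_storeu_pd(out, add_low_lane(a, b));
  std::printf("%g %g\n", out[0], out[1]);  // prints: 3 10
  return 0;
}
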
@@ -3215,6 +3253,57 @@ let Predicates = [HasAVX] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } // Patterns used to select SSE scalar fp arithmetic instructions from @@ -3269,6 +3358,49 @@ let Predicates = [UseSSE2] in { (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; } +let Predicates = [UseSSE41] in { + // With SSE4.1 we may see these operations using X86Blendi rather than + // X86Movs{s,d}. 
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (MULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; +} + let Predicates = [HasAVX] in { // The following patterns select AVX Scalar single/double precision fp // arithmetic instructions from a packed single precision fp instruction @@ -3298,6 +3430,46 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + // Also handle X86Blendi-based patterns. 
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; } /// Unop Arithmetic @@ -3326,6 +3498,16 @@ def SSE_SQRTSD : OpndItins< >; } +let Sched = WriteFRsqrt in { +def SSE_RSQRTPS : OpndItins< + IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM +>; + +def SSE_RSQRTSS : OpndItins< + IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM +>; +} + let Sched = WriteFRcp in { def SSE_RCPP : OpndItins< IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM @@ -3604,10 +3786,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. 
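
(Editorial note, not from the patch: the refinement the comment above refers to is normally one Newton-Raphson step applied to the roughly 12-bit estimate that rsqrtps returns, y' = 0.5 * y * (3 - x*y*y). A minimal C++ illustration using the standard SSE intrinsics, assuming an SSE-capable target:)

#include <immintrin.h>
#include <cmath>
#include <cstdio>

// One Newton-Raphson step on the coarse rsqrtps estimate roughly doubles
// the number of correct bits: y' = 0.5 * y * (3 - x * y * y).
static __m128 rsqrt_refined(__m128 x) {
  const __m128 half  = _mm_set1_ps(0.5f);
  const __m128 three = _mm_set1_ps(3.0f);
  __m128 y   = _mm_rsqrt_ps(x);                    // ~12-bit estimate
  __m128 xyy = _mm_mul_ps(_mm_mul_ps(x, y), y);    // x * y * y
  return _mm_mul_ps(_mm_mul_ps(half, y), _mm_sub_ps(three, xyy));
}

int main() {
  const float in[4] = {2.0f, 4.0f, 9.0f, 16.0f};
  float out[4];
  _mm_storeu_ps(out, rsqrt_refined(_mm_loadu_ps(in)));
  for (int i = 0; i < 4; ++i)
    std::printf("rsqrt(%g) ~ %.7f (exact %.7f)\n", in[i], out[i],
                1.0 / std::sqrt((double)in[i]));
  return 0;
}
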
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>, +defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>, sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, - int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>; + int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>; defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, @@ -3686,6 +3868,7 @@ let Predicates = [UseSSE1] in { let AddedComplexity = 400 in { // Prefer non-temporal versions let SchedRW = [WriteStore] in { +let Predicates = [HasAVX, NoVLX] in { def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -3726,6 +3909,7 @@ def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), [(alignednontemporalstore (v4i64 VR256:$src), addr:$dst)], IIC_SSE_MOVNT>, VEX, VEX_L; +} def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -3755,6 +3939,14 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), PS, Requires<[HasSSE2]>; } // SchedRW = [WriteStore] +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (VMOVNTPSmr addr:$dst, VR128:$src)>; +} + +def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (MOVNTPSmr addr:$dst, VR128:$src)>; + } // AddedComplexity //===----------------------------------------------------------------------===// @@ -4336,20 +4528,6 @@ defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, SSE_INTALU_ITINS_P, 0>; -//===---------------------------------------------------------------------===// -// SSE2 - Packed Integer Pack Instructions -//===---------------------------------------------------------------------===// - -defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128, - int_x86_avx2_packsswb, - SSE_INTALU_ITINS_SHUFF_P, 0>; -defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128, - int_x86_avx2_packssdw, - SSE_INTALU_ITINS_SHUFF_P, 0>; -defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128, - int_x86_avx2_packuswb, - SSE_INTALU_ITINS_SHUFF_P, 0>; - //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Shuffle Instructions //===---------------------------------------------------------------------===// @@ -4431,6 +4609,136 @@ let Predicates = [UseSSE2] in { (PSHUFDri VR128:$src1, imm:$imm)>; } +//===---------------------------------------------------------------------===// +// Packed Integer Pack Instructions (SSE & AVX) +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pack opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + bit Is2Addr = 1> { + def rr : PDI, + Sched<[WriteShuffle]>; + def rm : PDI, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse2_pack_y opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI, + Sched<[WriteShuffle]>; + def Yrm : PDI, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode 
OpNode, PatFrag bc_frag, + bit Is2Addr = 1> { + def rr : SS48I, + Sched<[WriteShuffle]>; + def rm : SS48I, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack_y opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : SS48I, + Sched<[WriteShuffle]>; + def Yrm : SS48I, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, + bc_v8i16, 0>, VEX_4V; + defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, + bc_v4i32, 0>, VEX_4V; + + defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, + bc_v8i16, 0>, VEX_4V; + defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, + bc_v4i32, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, + bc_v8i32>, VEX_4V, VEX_L; + + defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, + bc_v8i32>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, + bc_v8i16>; + defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, + bc_v4i32>; + + defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, + bc_v8i16>; + + let Predicates = [HasSSE41] in + defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, + bc_v4i32>; +} +} // ExeDomain = SSEPackedInt + //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Unpack Instructions //===---------------------------------------------------------------------===// @@ -5161,6 +5469,13 @@ let Predicates = [HasAVX] in { (VMOVDDUPYrr VR256:$src)>; } +let Predicates = [UseAVX, OptForSize] in { + def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VMOVDDUPrm addr:$src)>; +} + let Predicates = [UseSSE3] in { def : Pat<(X86Movddup (memopv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; @@ -5239,6 +5554,38 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { f128mem, SSE_ALU_F64P>, PD; } +// Patterns used to select 'addsub' instructions. 
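
(Aside, not part of the patch: X86Addsub is the node form of addsubps/addsubpd, which subtract in the even lanes and add in the odd lanes; the patterns that follow simply map it onto those instructions. A short C++ illustration with the standard SSE3 intrinsic, compiled with -msse3:)

#include <immintrin.h>
#include <cstdio>

int main() {
  // addsubpd: lane 0 -> a0 - b0, lane 1 -> a1 + b1
  __m128d a = _mm_set_pd(4.0, 3.0);   // lanes: [3.0, 4.0]
  __m128d b = _mm_set_pd(2.0, 1.0);   // lanes: [1.0, 2.0]
  double r[2];
  _mm_storeu_pd(r, _mm_addsub_pd(a, b));
  std::printf("%g %g\n", r[0], r[1]); // prints: 2 6
  return 0;
}
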
+let Predicates = [HasAVX] in { + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), + (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), + (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; + + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 (memop addr:$rhs)))), + (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 (memop addr:$rhs)))), + (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; +} + +let Predicates = [UseSSE3] in { + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), + (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), + (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; +} + //===---------------------------------------------------------------------===// // SSE3 Instructions //===---------------------------------------------------------------------===// @@ -6522,7 +6869,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insertf32 opc, string asm, bit Is2Addr = 1, OpndItins itins = DEFAULT_ITINS> { def rr : SS4AIi8 opc, string asm, bit Is2Addr = 1, (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, Sched<[WriteFShuffle]>; def rm : SS4AIi8 opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX] in { let isCommutable = 0 in - defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, - 0, DEFAULT_ITINS_SHUFFLESCHED>, VEX_4V; defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; @@ -7086,9 +7431,6 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { let isCommutable = 0 in - defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw", - int_x86_avx2_packusdw, WriteShuffle>, - VEX_4V, VEX_L; defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; @@ -7120,8 +7462,6 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { let isCommutable = 0 in - defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw, - 1, DEFAULT_ITINS_SHUFFLESCHED>; defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128, @@ -7145,7 +7485,7 @@ let Constraints = "$src1 = $dst" in { let Predicates = [HasAVX] in { defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, - memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>, VEX_4V; defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, @@ -7153,7 +7493,7 @@ let Predicates = [HasAVX] in { } let Predicates = [HasAVX2] in { defm 
VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, - memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + memopv4i64, i256mem, 0, SSE_PMULLD_ITINS>, VEX_4V, VEX_L; defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, @@ -7174,7 +7514,7 @@ multiclass SS41I_binop_rmi_int opc, string OpcodeStr, OpndItins itins = DEFAULT_ITINS> { let isCommutable = 1 in def rri : SS4AIi8 opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, Sched<[itins.Sched]>; def rmi : SS4AIi8; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, VR128, memopv2i64, i128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; + 1, SSE_INTALU_ITINS_BLEND_P>; defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, VR128, memopv2i64, i128mem, 1, SSE_MPSADBW_ITINS>; @@ -7382,6 +7722,57 @@ let Predicates = [HasAVX2] in { (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; } +// Patterns +let Predicates = [UseAVX] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VBLENDPSrri (v4i32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>; + } + + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), + sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), + sub_xmm)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; + + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>; +} + +let Predicates = [UseSSE41] in { + // With SSE41 we can use blends for these patterns. 
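
(Aside, not from the patch: X86vzmovl keeps lane 0 of its operand and zeroes every other lane. The blend-based lowering selected by the patterns below is equivalent to blending the source against an all-zero vector, as this hypothetical helper shows; the C++ sketch assumes SSE4.1 and the standard intrinsics.)

#include <immintrin.h>
#include <cstdio>

// Keep only lane 0 of v and zero lanes 1..3 -- the same operation the
// X86vzmovl -> BLENDPSrri patterns encode as a blend with V_SET0.
static __m128 zero_upper_lanes(__m128 v) {
  return _mm_blend_ps(_mm_setzero_ps(), v, 0x1);  // bit 0 set: lane 0 from v
}

int main() {
  float out[4];
  _mm_storeu_ps(out, zero_upper_lanes(_mm_set_ps(4.f, 3.f, 2.f, 1.f)));
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 0 0 0
  return 0;
}
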
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>; +} + + /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { multiclass SS41I_ternary_int opc, string OpcodeStr, PatFrag mem_frag, @@ -7392,7 +7783,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))], - itins.rr>; + itins.rr>, Sched<[itins.Sched]>; def rm0 : SS48I; + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } let ExeDomain = SSEPackedDouble in defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, - int_x86_sse41_blendvpd>; + int_x86_sse41_blendvpd, + DEFAULT_ITINS_FBLENDSCHED>; let ExeDomain = SSEPackedSingle in defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, - int_x86_sse41_blendvps>; + int_x86_sse41_blendvps, + DEFAULT_ITINS_FBLENDSCHED>; defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, - int_x86_sse41_pblendvb>; + int_x86_sse41_pblendvb, + DEFAULT_ITINS_VARBLENDSCHED>; // Aliases with the implicit xmm0 argument def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", @@ -8230,13 +8624,13 @@ multiclass avx_permil opc_rm, bits<8> opc_rmi, string OpcodeStr, def ri : AVXAIi8, VEX, + [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffle]>; def mi : AVXAIi8, VEX, + (vt (X86VPermilpi (memop addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; } @@ -8254,19 +8648,37 @@ let ExeDomain = SSEPackedDouble in { } let Predicates = [HasAVX] in { -def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), + (VPERMILPSYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPERMILPSYrm VR256:$src1, addr:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))), + (VPERMILPDYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))), + (VPERMILPDYrm VR256:$src1, addr:$src2)>; + +def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, imm:$imm)>; -def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)), +def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)), (i8 imm:$imm))), (VPERMILPSYmi addr:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDYmi addr:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))), +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))), + (VPERMILPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPERMILPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))), + (VPERMILPDrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))), + (VPERMILPDrm VR128:$src1, 
addr:$src2)>; + +def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))), (VPERMILPDri VR128:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDmi addr:$src1, imm:$imm)>; } @@ -8375,6 +8787,21 @@ let Predicates = [HasF16C] in { (VCVTPH2PSrm addr:$src)>; } +// Patterns for matching conversions from float to half-float and vice versa. +let Predicates = [HasF16C] in { + def : Pat<(fp_to_f16 FR32:$src), + (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr + (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>; + + def : Pat<(f16_to_fp GR16:$src), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; + + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >; +} + //===----------------------------------------------------------------------===// // AVX2 Instructions //===----------------------------------------------------------------------===// @@ -8385,13 +8812,13 @@ multiclass AVX2_binop_rmi_int opc, string OpcodeStr, X86MemOperand x86memop> { let isCommutable = 1 in def rri : AVX2AIi8, Sched<[WriteBlend]>, VEX_4V; def rmi : AVX2AIi8; + // Provide aliases for broadcast from the same regitser class that + // automatically does the extract. + def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), + (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), + sub_xmm)))>; + def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))), + (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), + sub_xmm)))>; + def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))), + (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))), + (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), + sub_xmm)))>; + def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), + (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), + (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), + sub_xmm)))>; + // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
let AddedComplexity = 20 in { @@ -8578,6 +9026,9 @@ let Predicates = [HasAVX] in { (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm), (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; } + + def : Pat<(v2f64 (X86VBroadcast f64:$src)), + (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; } //===----------------------------------------------------------------------===// @@ -8585,14 +9036,14 @@ let Predicates = [HasAVX] in { // multiclass avx2_perm opc, string OpcodeStr, PatFrag mem_frag, - ValueType OpVT> { + ValueType OpVT, X86FoldableSchedWrite Sched> { def Yrr : AVX28I, - Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; + Sched<[Sched]>, VEX_4V, VEX_L; def Yrm : AVX28I opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, (bitconvert (mem_frag addr:$src2)))))]>, - Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; + Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L; } -defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>; +defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>; let ExeDomain = SSEPackedSingle in -defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>; +defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>; multiclass avx2_perm_imm opc, string OpcodeStr, PatFrag mem_frag, - ValueType OpVT> { + ValueType OpVT, X86FoldableSchedWrite Sched> { def Yri : AVX2AIi8, - Sched<[WriteShuffle256]>, VEX, VEX_L; + Sched<[Sched]>, VEX, VEX_L; def Ymi : AVX2AIi8 opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermi (mem_frag addr:$src1), (i8 imm:$src2))))]>, - Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX, VEX_L; + Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L; } -defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W; +defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, + WriteShuffle256>, VEX_W; let ExeDomain = SSEPackedDouble in -defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W; +defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, + WriteFShuffle256>, VEX_W; //===----------------------------------------------------------------------===// // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks