X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86InstrSSE.td;h=327d9a7f5154de271fcfdcfa3be61bfebb006ffe;hb=453f4954f2189445d73d9303a0b927cb955af559;hp=7a6a6250e7f94e8b570324793c84d4d6d3817b06;hpb=07b7f672a066317ffc8224c61dc3b72ce857f24d;p=oota-llvm.git diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 7a6a6250e7f..327d9a7f515 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -119,9 +119,42 @@ multiclass sse12_fp_packed_int opc, string OpcodeStr, RegisterClass RC, // Non-instruction patterns //===----------------------------------------------------------------------===// +// A vector extract of the first f32 position is a subregister copy def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; +// A 128-bit subvector extract from the first 256-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>; +def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>; + +def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>; +def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>; + +def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))), + (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>; +def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))), + (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>; + +// A 128-bit subvector insert to the first 256-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)), + (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)), + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)), + (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)), + (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; + // Implicitly promote a 32-bit scalar to a vector. def : Pat<(v4f32 (scalar_to_vector FR32:$src)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>; @@ -262,7 +295,13 @@ def : Pat<(bc_v4i64 (v8f32 immAllZerosV)), (SUBREG_TO_REG (i64 0), (AVX_SET0PI), sub_xmm)>; //===----------------------------------------------------------------------===// -// SSE 1 & 2 - Move Instructions +// SSE 1 & 2 - Move FP Scalar Instructions +// +// Move Instructions. Register-to-register movss/movsd is not used for FR32/64 +// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr +// is used instead. Register-to-register movss/movsd is not modeled as an +// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable +// in terms of a copy, and just mentioned, we don't use movss/movsd for copies. //===----------------------------------------------------------------------===// class sse12_move_rr : @@ -276,11 +315,7 @@ class sse12_move_rm; -// Move Instructions. Register-to-register movss/movsd is not used for FR32/64 -// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr -// is used instead. Register-to-register movss/movsd is not modeled as an -// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable -// in terms of a copy, and just mentioned, we don't use movss/movsd for copies. +// AVX def VMOVSSrr : sse12_move_rr, XS, VEX_4V; def VMOVSDrr : sse12_move_rr, XS, VEX; - let AddedComplexity = 20 in def VMOVSDrm : sse12_move_rm, XD, VEX; } +def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), + "movss\t{$src, $dst|$dst, $src}", + [(store FR32:$src, addr:$dst)]>, XS, VEX; +def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), + "movsd\t{$src, $dst|$dst, $src}", + [(store FR64:$src, addr:$dst)]>, XD, VEX; + +// SSE1 & 2 let Constraints = "$src1 = $dst" in { def MOVSSrr : sse12_move_rr, XS; @@ -307,19 +349,37 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { def MOVSDrm : sse12_move_rm, XD; } -let AddedComplexity = 15 in { -// Extract the low 32-bit value from one vector and insert it into another. -def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)), - (MOVSSrr (v4f32 VR128:$src1), - (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; -// Extract the low 64-bit value from one vector and insert it into another. -def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)), - (MOVSDrr (v2f64 VR128:$src1), - (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; -} +def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), + "movss\t{$src, $dst|$dst, $src}", + [(store FR32:$src, addr:$dst)]>; +def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), + "movsd\t{$src, $dst|$dst, $src}", + [(store FR64:$src, addr:$dst)]>; -let AddedComplexity = 20 in { +// Patterns let Predicates = [HasSSE1] in { + let AddedComplexity = 15 in { + // Extract the low 32-bit value from one vector and insert it into another. + def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)), + (MOVSSrr (v4f32 VR128:$src1), + (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; + def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)), + (MOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; + + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSS to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (MOVSSrr (v4f32 (V_SET0PS)), + (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (MOVSSrr (v4i32 (V_SET0PI)), + (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>; + } + + let AddedComplexity = 20 in { // MOVSSrm zeros the high parts of the register; represent this // with SUBREG_TO_REG. def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), @@ -328,8 +388,48 @@ let Predicates = [HasSSE1] in { (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; + } + + // Extract and store. + def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSSmr addr:$dst, + (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + + // Shuffle with MOVSS + def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))), + (MOVSSrr VR128:$src1, FR32:$src2)>; + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr (v4f32 VR128:$src1), + (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; } + let Predicates = [HasSSE2] in { + let AddedComplexity = 15 in { + // Extract the low 64-bit value from one vector and insert it into another. + def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)), + (MOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)), + (MOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; + + // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd + def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; + def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; + + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSD to the lower bits. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>; + } + + let AddedComplexity = 20 in { // MOVSDrm zeros the high parts of the register; represent this // with SUBREG_TO_REG. def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), @@ -342,66 +442,161 @@ let Predicates = [HasSSE2] in { (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; def : Pat<(v2f64 (X86vzload addr:$src)), (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; -} + } + + // Extract and store. + def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSDmr addr:$dst, + (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; + + // Shuffle with MOVSD + def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), + (MOVSDrr VR128:$src1, FR64:$src2)>; + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>; } -let AddedComplexity = 20, Predicates = [HasAVX] in { -// MOVSSrm zeros the high parts of the register; represent this -// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 -def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; -def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), - (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; -def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), - (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; -// MOVSDrm zeros the high parts of the register; represent this -// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 -def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), - (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; -def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), - (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; -def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), - (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; -def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), - (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; -def : Pat<(v2f64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; -// Represent the same patterns above but in the form they appear for -// 256-bit types -def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))), - (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; -def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))), - (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_sd)>; -} - -// Store scalar value to memory. -def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), - "movss\t{$src, $dst|$dst, $src}", - [(store FR32:$src, addr:$dst)]>; -def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), - "movsd\t{$src, $dst|$dst, $src}", - [(store FR64:$src, addr:$dst)]>; +let Predicates = [HasAVX] in { + let AddedComplexity = 15 in { + // Extract the low 32-bit value from one vector and insert it into another. + def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4f32 VR128:$src1), + (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; + def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; + + // Extract the low 64-bit value from one vector and insert it into another. + def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)), + (VMOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)), + (VMOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; + + // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd + def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; + def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; + + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (VMOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VMOVSSrr (v4f32 (V_SET0PS)), + (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VMOVSSrr (v4i32 (V_SET0PI)), + (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (VMOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>; + } -def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), - "movss\t{$src, $dst|$dst, $src}", - [(store FR32:$src, addr:$dst)]>, XS, VEX; -def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), - "movsd\t{$src, $dst|$dst, $src}", - [(store FR64:$src, addr:$dst)]>, XD, VEX; + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; -// Extract and store. -def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), - addr:$dst), - (MOVSSmr addr:$dst, - (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; -def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), - addr:$dst), - (MOVSDmr addr:$dst, - (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_sd)>; + } + + // Extract and store. + def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSSmr addr:$dst, + (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSDmr addr:$dst, + (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; + + // Shuffle with VMOVSS + def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))), + (VMOVSSrr VR128:$src1, FR32:$src2)>; + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4f32 VR128:$src1), + (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; + + // Shuffle with VMOVSD + def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), + (VMOVSDrr VR128:$src1, FR64:$src2)>; + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), + sub_sd))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), + sub_sd))>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), + sub_sd))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), + sub_sd))>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions +//===----------------------------------------------------------------------===// -// Move Aligned/Unaligned floating point values multiclass sse12_mov_packed opc, RegisterClass RC, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d, @@ -416,22 +611,22 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in } defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, - "movaps", SSEPackedSingle>, VEX; + "movaps", SSEPackedSingle>, TB, VEX; defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, - "movapd", SSEPackedDouble>, OpSize, VEX; + "movapd", SSEPackedDouble>, TB, OpSize, VEX; defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, - "movups", SSEPackedSingle>, VEX; + "movups", SSEPackedSingle>, TB, VEX; defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, - "movupd", SSEPackedDouble, 0>, OpSize, VEX; + "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX; defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, - "movaps", SSEPackedSingle>, VEX; + "movaps", SSEPackedSingle>, TB, VEX; defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, - "movapd", SSEPackedDouble>, OpSize, VEX; + "movapd", SSEPackedDouble>, TB, OpSize, VEX; defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, - "movups", SSEPackedSingle>, VEX; + "movups", SSEPackedSingle>, TB, VEX; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, - "movupd", SSEPackedDouble, 0>, OpSize, VEX; + "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX; defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle>, TB; defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, @@ -487,22 +682,19 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movupd\t{$src, $dst|$dst, $src}", [(store (v2f64 VR128:$src), addr:$dst)]>; -// Intrinsic forms of MOVUPS/D load and store -def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movups\t{$src, $dst|$dst, $src}", - [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX; -def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; - -def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movups\t{$src, $dst|$dst, $src}", - [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>; -def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>; +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), + (VMOVUPDmr addr:$dst, VR128:$src)>; +} + +let Predicates = [HasSSE1] in + def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), + (MOVUPSmr addr:$dst, VR128:$src)>; +let Predicates = [HasSSE2] in + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), + (MOVUPDmr addr:$dst, VR128:$src)>; // Move Low/High packed floating point values multiclass sse12_mov_hilo_packedopc, RegisterClass RC, @@ -658,11 +850,12 @@ let Predicates = [HasSSE1] in { // MOVHPS patterns def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), (MOVHPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, - (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), (MOVHPSrm VR128:$src1, addr:$src2)>; // MOVLHPS patterns @@ -715,14 +908,6 @@ multiclass sse12_cvt_s opc, RegisterClass SrcRC, RegisterClass DstRC, [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>; } -multiclass sse12_cvt_s_np opc, RegisterClass SrcRC, RegisterClass DstRC, - X86MemOperand x86memop, string asm> { - def rr : SI; - def rm : SI; -} - multiclass sse12_cvt_p opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d> { @@ -844,10 +1029,12 @@ defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, // Get rid of this hack or rename the intrinsics, there are several // intructions that only match with the intrinsic form, why create duplicates // to let them be recognized by the assembler? -defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, +let Pattern = [] in { +defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, undef, f64mem, load, "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; -defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, +defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, undef, f64mem, load, "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W; +} defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, f128mem, load, "cvtsd2si{l}">, XD; defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, @@ -1220,13 +1407,13 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), let Predicates = [HasAVX] in { // SSE2 instructions without OpSize prefix def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; } def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB; @@ -1236,12 +1423,12 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, - VEX, Requires<[HasAVX]>; + TB, VEX, Requires<[HasAVX]>; def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd (load addr:$src)))]>, - VEX, Requires<[HasAVX]>; + TB, VEX, Requires<[HasAVX]>; def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, @@ -1451,25 +1638,25 @@ multiclass sse12_ord_cmp opc, RegisterClass RC, SDNode OpNode, let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, VEX; + "ucomiss", SSEPackedSingle>, TB, VEX; defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, OpSize, VEX; + "ucomisd", SSEPackedDouble>, TB, OpSize, VEX; let Pattern = [] in { defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, VEX; + "comiss", SSEPackedSingle>, TB, VEX; defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, OpSize, VEX; + "comisd", SSEPackedDouble>, TB, OpSize, VEX; } defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss", SSEPackedSingle>, VEX; + load, "ucomiss", SSEPackedSingle>, TB, VEX; defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd", SSEPackedDouble>, OpSize, VEX; + load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX; defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, - load, "comiss", SSEPackedSingle>, VEX; + load, "comiss", SSEPackedSingle>, TB, VEX; defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, - load, "comisd", SSEPackedDouble>, OpSize, VEX; + load, "comisd", SSEPackedDouble>, TB, OpSize, VEX; defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, "ucomiss", SSEPackedSingle>, TB; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, @@ -1518,19 +1705,19 @@ multiclass sse12_cmp_packed, VEX_4V; + SSEPackedSingle>, TB, VEX_4V; defm VCMPPD : sse12_cmp_packed, OpSize, VEX_4V; + SSEPackedDouble>, TB, OpSize, VEX_4V; defm VCMPPSY : sse12_cmp_packed, VEX_4V; + SSEPackedSingle>, TB, VEX_4V; defm VCMPPDY : sse12_cmp_packed, OpSize, VEX_4V; + SSEPackedDouble>, TB, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed opc, PatFrag OpNode, ValueType vt, let AddedComplexity = 10 in { defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; + SSEPackedSingle>, TB, VEX_4V; defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; + SSEPackedDouble>, TB, OpSize, VEX_4V; defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; + SSEPackedSingle>, TB, VEX_4V; defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; + SSEPackedDouble>, TB, OpSize, VEX_4V; defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; + SSEPackedSingle>, TB, VEX_4V; defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; + SSEPackedDouble>, TB, OpSize, VEX_4V; defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32, VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; + SSEPackedSingle>, TB, VEX_4V; defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; + SSEPackedDouble>, TB, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, @@ -1963,14 +2150,14 @@ let Predicates = [HasAVX] in { // Assembler Only def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX; def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB, OpSize, VEX; def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX; def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB, OpSize, VEX; } @@ -3184,7 +3371,7 @@ def mi : Ii8<0x70, MRMSrcMem, let Predicates = [HasAVX] in { let AddedComplexity = 5 in - defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, OpSize, + defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize, VEX; // SSE2 with ImmT == Imm8 and XS prefix. @@ -3204,14 +3391,14 @@ let Predicates = [HasAVX] in { def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)), (i8 imm:$imm))), - (VPSHUFDmi addr:$src1, imm:$imm)>, Requires<[HasAVX]>; + (VPSHUFDmi addr:$src1, imm:$imm)>; def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)), (i8 imm:$imm))), (VPSHUFDmi addr:$src1, imm:$imm)>; def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), - (VPSHUFDri VR128:$src1, imm:$imm)>, Requires<[HasAVX]>; + (VPSHUFDri VR128:$src1, imm:$imm)>; def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))), - (VPSHUFDri VR128:$src1, imm:$imm)>, Requires<[HasAVX]>; + (VPSHUFDri VR128:$src1, imm:$imm)>; def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))), (VPSHUFHWri VR128:$src, imm:$imm)>; def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)), @@ -3399,7 +3586,7 @@ def VPEXTRWri : Ii8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), - imm:$src2))]>, OpSize, VEX; + imm:$src2))]>, TB, OpSize, VEX; def PEXTRWri : PDIi8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -3408,11 +3595,11 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, // Insert let Predicates = [HasAVX] in { - defm VPINSRW : sse2_pinsrw<0>, OpSize, VEX_4V; + defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V; def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, OpSize, VEX_4V; + []>, TB, OpSize, VEX_4V; } let Constraints = "$src1 = $dst" in @@ -4010,6 +4197,20 @@ let Predicates = [HasAVX] in { def : Pat<(X86Movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src))))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + + // 256-bit version + def : Pat<(X86Movddup (memopv4f64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (memopv4i64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4f64 (scalar_to_vector (loadf64 addr:$src)))), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4f64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; + def : Pat<(X86Movddup (v4i64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; } //===---------------------------------------------------------------------===// @@ -4350,22 +4551,6 @@ let Predicates = [HasSSE2] in def : Pat<(fextend (loadf32 addr:$src)), (CVTSS2SDrm addr:$src)>; -// Move scalar to XMM zero-extended -// movd to XMM register zero-extends -let AddedComplexity = 15 in { -// Zeroing a VR128 then do a MOVS{S|D} to the lower bits. -def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>; -def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>; -def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (MOVSSrr (v4f32 (V_SET0PS)), - (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>; -def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (MOVSSrr (v4i32 (V_SET0PI)), - (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>; -} - // Splat v2f64 / v2i64 let AddedComplexity = 10 in { def : Pat<(splat_lo (v2i64 VR128:$src), (undef)), @@ -4395,24 +4580,6 @@ def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), (MOVLPDmr addr:$src1, VR128:$src2)>; -let AddedComplexity = 15 in { -// Setting the lowest element in the vector. -def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)), - (MOVSSrr (v4i32 VR128:$src1), - (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; -def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)), - (MOVSDrr (v2i64 VR128:$src1), - (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; - -// vector_shuffle v1, v2 <4, 5, 2, 3> using movsd -def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>, - Requires<[HasSSE2]>; -def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>, - Requires<[HasSSE2]>; -} - // Set lowest element and zero upper elements. def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>; @@ -5943,20 +6110,6 @@ def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; -// Special COPY patterns -def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)), - (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)), - (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)), - (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)), - (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)), - (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)), - (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; - //===----------------------------------------------------------------------===// // VEXTRACTF128 - Extract packed floating-point values // @@ -6001,23 +6154,6 @@ def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), (v32i8 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -// Special COPY patterns -def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))), - (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>; -def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))), - (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>; - -def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))), - (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>; -def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))), - (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>; - -def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))), - (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>; -def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))), - (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>; - - //===----------------------------------------------------------------------===// // VMASKMOV - Conditional SIMD Packed Loads and Stores // @@ -6159,14 +6295,13 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { // Zero All YMM registers def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", - [(int_x86_avx_vzeroall)]>, VEX, VEX_L, Requires<[HasAVX]>; + [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>; + // Zero Upper bits of YMM registers + def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", + [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>; } -// Zero Upper bits of YMM registers -def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", - [(int_x86_avx_vzeroupper)]>, VEX, Requires<[HasAVX]>; - //===----------------------------------------------------------------------===// // SSE Shuffle pattern fragments //===----------------------------------------------------------------------===// @@ -6190,30 +6325,6 @@ def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; -// Shuffle with MOVSS -def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))), - (MOVSSrr VR128:$src1, FR32:$src2)>; -def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), - (MOVSSrr (v4i32 VR128:$src1), - (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; -def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), - (MOVSSrr (v4f32 VR128:$src1), - (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; - -// Shuffle with MOVSD -def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), - (MOVSDrr VR128:$src1, FR64:$src2)>; -def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr (v2i64 VR128:$src1), - (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; -def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr (v2f64 VR128:$src1), - (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; -def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>; -def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>; - // Shuffle with MOVLPS def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), (MOVLPSrm VR128:$src1, addr:$src2)>; @@ -6222,15 +6333,6 @@ def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), def : Pat<(X86Movlps VR128:$src1, (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), (MOVLPSrm VR128:$src1, addr:$src2)>; -// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem -// is during lowering, where it's not possible to recognize the load fold cause -// it has two uses through a bitcast. One use disappears at isel time and the -// fold opportunity reappears. -def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>; - -def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>; // Shuffle with MOVLPD def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),