-let Predicates = [HasSSE1] in {
- let AddedComplexity = 15 in {
- // Extract the low 32-bit value from one vector and insert it into another.
- def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4f32 VR128:$src1),
- (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
- def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4i32 VR128:$src1),
- (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
-
- // Move scalar to XMM zero-extended, zeroing a VR128 then do a
- // MOVSS to the lower bits.
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
- (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
- def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (MOVSSrr (v4f32 (V_SET0)),
- (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (MOVSSrr (v4i32 (V_SET0)),
- (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
- }
-
- let AddedComplexity = 20 in {
- // MOVSSrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG.
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
- def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
- }
-
- // Extract and store.
- def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
- addr:$dst),
- (MOVSSmr addr:$dst,
- (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
-
- // Shuffle with MOVSS
- def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
- (MOVSSrr VR128:$src1, FR32:$src2)>;
- def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4i32 VR128:$src1),
- (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
- def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4f32 VR128:$src1),
- (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
-}
-
-let Predicates = [HasSSE2] in {
- let AddedComplexity = 15 in {
- // Extract the low 64-bit value from one vector and insert it into another.
- def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2f64 VR128:$src1),
- (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
- def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2i64 VR128:$src1),
- (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
-
- // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
- def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
- def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
-
- // Move scalar to XMM zero-extended, zeroing a VR128 then do a
- // MOVSD to the lower bits.
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
- (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
- }
-
- let AddedComplexity = 20 in {
- // MOVSDrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG.
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
- def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
- def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
- def : Pat<(v2f64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
- }
-
- // Extract and store.
- def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
- addr:$dst),
- (MOVSDmr addr:$dst,
- (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
-
- // Shuffle with MOVSD
- def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
- (MOVSDrr VR128:$src1, FR64:$src2)>;
- def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2i64 VR128:$src1),
- (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2f64 VR128:$src1),
- (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
- def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
- def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
-
- // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
- // is during lowering, where it's not possible to recognize the fold cause
- // it has two uses through a bitcast. One use disappears at isel time and the
- // fold opportunity reappears.
- def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),sub_sd))>;
- def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2),sub_sd))>;
- def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
- def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
-}
-