/// sse12_shuffle - SSE1/SSE2 floating-point shuffle instructions.
/// Both variants use opcode 0xC6, take an 8-bit immediate as $src3, and are
/// selected from an X86Shufp node producing a value of type `vt`:
///   rmi - $src2 is a memory operand (x86memop) loaded through mem_frag.
///   rri - $src2 is a register of class RC.
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string asm, PatFrag mem_frag,
- Domain d, bit IsConvertibleToThreeAddress = 0> {
+ Domain d> {
def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
(i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
Sched<[WriteFShuffleLd, ReadAfterLd]>;
- let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
- def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
- [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
- (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
- Sched<[WriteFShuffle]>;
+ def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+ (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+ Sched<[WriteFShuffle]>;
}
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
// SHUFPS / SHUFPD instantiations of sse12_shuffle. The constraint ties $src1
// to $dst, which is why the asm strings only print $dst, $src2 and $src3.
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, PS;
+ memopv4f32, SSEPackedSingle>, PS;
defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, PD;
+ memopv2f64, SSEPackedDouble>, PD;
}
let Predicates = [HasAVX] in {
let Predicates = [UseSSE2] in {
// SSE2 patterns to select scalar double-precision fp arithmetic instructions
-
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
(f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
FR64:$src))))),
}
let Predicates = [UseSSE41] in {
- // If the subtarget has SSE4.1 but not AVX, the vector insert
- // instruction is lowered into a X86insertps rather than a X86Movss.
- // When selecting SSE scalar single-precision fp arithmetic instructions,
- // make sure that we correctly match the X86insertps.
+ // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is
+ // lowered into an X86insertps or an X86Blendi rather than an X86Movss. When
+ // selecting SSE scalar single-precision fp arithmetic instructions, make
+ // sure that we correctly match them.
def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
(fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // v4f32 scalar arithmetic reached through X86Blendi: $dst is blended with a
+ // scalar_to_vector of (op (extract $dst, 0), $src) using immediate 1. Each
+ // of fadd/fsub/fmul/fdiv is selected to the matching <OP>SSrr_Int, with the
+ // FR32 $src copied into VR128.
+ // NOTE(review): presumably immediate 1 means only lane 0 is taken from the
+ // second operand (the scalar result) -- confirm against the X86Blendi
+ // lowering.
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // v2f64 scalar arithmetic reached through X86Blendi: $dst is blended with a
+ // scalar_to_vector of (op (extract $dst, 0), $src) using immediate 1. Each
+ // of fadd/fsub/fmul/fdiv is selected to the matching <OP>SDrr_Int, with the
+ // FR64 $src copied into VR128.
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // v2f64 forms with the blend operands in the opposite order: the
+ // scalar_to_vector result is the first operand, $dst is the second, and the
+ // immediate is 2. They select the same <OP>SDrr_Int instructions.
+ // NOTE(review): presumably the commuted equivalent of the immediate-1
+ // form -- confirm against the X86Blendi lowering.
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
}
let Predicates = [HasAVX] in {
(fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // v4f32 scalar arithmetic reached through X86Blendi: $dst is blended with a
+ // scalar_to_vector of (op (extract $dst, 0), $src) using immediate 1. Each
+ // of fadd/fsub/fmul/fdiv is selected to the matching V<OP>SSrr_Int, with
+ // the FR32 $src copied into VR128.
+ // NOTE(review): presumably immediate 1 means only lane 0 is taken from the
+ // second operand (the scalar result) -- confirm against the X86Blendi
+ // lowering.
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // v2f64 scalar arithmetic reached through X86Blendi: $dst is blended with a
+ // scalar_to_vector of (op (extract $dst, 0), $src) using immediate 1. Each
+ // of fadd/fsub/fmul/fdiv is selected to the matching V<OP>SDrr_Int, with
+ // the FR64 $src copied into VR128.
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // v2f64 forms with the blend operands in the opposite order: the
+ // scalar_to_vector result is the first operand, $dst is the second, and the
+ // immediate is 2. They select the same V<OP>SDrr_Int instructions.
+ // NOTE(review): presumably the commuted equivalent of the immediate-1
+ // form -- confirm against the X86Blendi lowering.
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
}
// Patterns used to select SSE scalar fp arithmetic instructions from
(DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
}
+let Predicates = [UseSSE41] in {
+ // With SSE4.1 we may see these operations using X86Blendi rather than
+ // X86Movs{s,d}.
+ //
+ // v4f32: $dst blended with a full-vector (op $dst, $src) using immediate 1
+ // is selected to the scalar <OP>SSrr_Int instruction.
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+
+ // v2f64: same shape as above, selected to <OP>SDrr_Int.
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+
+ // v2f64 with the blend operands swapped (operation result first, $dst
+ // second) and immediate 2; selected to the same <OP>SDrr_Int instructions.
+ // NOTE(review): presumably the commuted equivalent of the immediate-1
+ // form -- confirm against the X86Blendi lowering.
+ def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
let Predicates = [HasAVX] in {
// The following patterns select AVX Scalar single/double precision fp
// arithmetic instructions from a packed single precision fp instruction
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
(fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
(VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+
+ // Also handle X86Blendi-based patterns.
+ //
+ // v4f32: $dst blended with a full-vector (op $dst, $src) using immediate 1
+ // is selected to the scalar V<OP>SSrr_Int instruction.
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+
+ // v2f64: same shape as above, selected to V<OP>SDrr_Int.
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+
+ // v2f64 with the blend operands swapped (operation result first, $dst
+ // second) and immediate 2; selected to the same V<OP>SDrr_Int instructions.
+ // NOTE(review): presumably the commuted equivalent of the immediate-1
+ // form -- confirm against the X86Blendi lowering.
+ def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
}
/// Unop Arithmetic
PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStore]
+// Aligned non-temporal store of a v4i32 value held in VR128 is selected to
+// VMOVNTPSmr when both the HasAVX and NoVLX predicates hold.
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVNTPSmr addr:$dst, VR128:$src)>;
+}
+
+// Aligned non-temporal store of a v4i32 value held in VR128 is selected to
+// MOVNTPSmr.
+// NOTE(review): no Predicates list is attached to this pattern in the visible
+// code, whereas the VMOVNTPSmr pattern for the same store is restricted to
+// [HasAVX, NoVLX]. Confirm whether an enclosing scope restricts this pattern
+// or whether it should carry an explicit non-AVX predicate.
+def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+ (MOVNTPSmr addr:$dst, VR128:$src)>;
+
} // AddedComplexity
//===----------------------------------------------------------------------===//