//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE specific DAG Nodes.
//===----------------------------------------------------------------------===//

def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
                                            SDTCisFP<0>, SDTCisInt<2> ]>;

def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
def X86fand    : SDNode<"X86ISD::FAND",      SDTFPBinOp,
                        [SDNPCommutative, SDNPAssociative]>;
def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
                        [SDNPCommutative, SDNPAssociative]>;
def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
                        [SDNPCommutative, SDNPAssociative]>;
def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
def X86pextrb  : SDNode<"X86ISD::PEXTRB",
                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
def X86pextrw  : SDNode<"X86ISD::PEXTRW",
                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
def X86pinsrb  : SDNode<"X86ISD::PINSRB",
                 SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
def X86pinsrw  : SDNode<"X86ISD::PINSRW",
                 SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
def X86insrtps : SDNode<"X86ISD::INSERTPS",
                 SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
                                      SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
                 SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                        [SDNPHasChain, SDNPMayLoad]>;

//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//

// These are 'extloads' from a scalar to the low element of a vector, zeroing
// the top elements. These are used for the SSE 'ss' and 'sd' instruction
// forms.
def sse_load_f32 : ComplexPattern<v4f32, 4, "SelectScalarSSELoad", [],
                                  [SDNPHasChain, SDNPMayLoad]>;
def sse_load_f64 : ComplexPattern<v2f64, 4, "SelectScalarSSELoad", [],
                                  [SDNPHasChain, SDNPMayLoad]>;

def ssmem : Operand<v4f32> {
  let PrintMethod = "printf32mem";
  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
}
def sdmem : Operand<v2f64> {
  let PrintMethod = "printf64mem";
  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
}

//===----------------------------------------------------------------------===//
// SSE pattern fragments
//===----------------------------------------------------------------------===//

def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;

// Like 'store', but always requires vector alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                           (st node:$val, node:$ptr), [{
  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
    return !ST->isTruncatingStore() &&
           ST->getAddressingMode() == ISD::UNINDEXED &&
           ST->getAlignment() >= 16;
  return false;
}]>;

// Like 'load', but always requires vector alignment.
def alignedload : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
    return LD->getExtensionType() == ISD::NON_EXTLOAD &&
           LD->getAddressingMode() == ISD::UNINDEXED &&
           LD->getAlignment() >= 16;
  return false;
}]>;

def alignedloadfsf32 : PatFrag<(ops node:$ptr), (f32   (alignedload node:$ptr))>;
def alignedloadfsf64 : PatFrag<(ops node:$ptr), (f64   (alignedload node:$ptr))>;
def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>;
def alignedloadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (alignedload node:$ptr))>;
def alignedloadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (alignedload node:$ptr))>;
def alignedloadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (alignedload node:$ptr))>;

// Like 'load', but uses special alignment checks suitable for use in
// memory operands in most SSE instructions, which are required to
// be naturally aligned on some targets but not on others.
// FIXME: Actually implement support for targets that don't require the
// alignment. This probably wants a subtarget predicate.
def memop : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
    return LD->getExtensionType() == ISD::NON_EXTLOAD &&
           LD->getAddressingMode() == ISD::UNINDEXED &&
           LD->getAlignment() >= 16;
  return false;
}]>;

def memopfsf32 : PatFrag<(ops node:$ptr), (f32   (memop node:$ptr))>;
def memopfsf64 : PatFrag<(ops node:$ptr), (f64   (memop node:$ptr))>;
def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;

// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
// 16-byte boundary.
// FIXME: 8 byte alignment for mmx reads is not required
def memop64 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
    return LD->getExtensionType() == ISD::NON_EXTLOAD &&
           LD->getAddressingMode() == ISD::UNINDEXED &&
           LD->getAlignment() >= 8;
  return false;
}]>;

def memopv8i8  : PatFrag<(ops node:$ptr), (v8i8  (memop64 node:$ptr))>;
def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>;
def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;

def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
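
// The bc_* fragments are thin wrappers that pin the result type of a
// bitconvert. A minimal sketch of their use (mirroring the ANDPSrm pattern
// later in this file), where bc_v2i64 lets a v4f32 register feed an integer
// 'and':
//
//   [(set VR128:$dst, (and (bc_v2i64 (v4f32 VR128:$src1)),
//                          (memopv2i64 addr:$src2)))]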

def fp32imm0 : PatLeaf<(f32 fpimm), [{
  return N->isExactlyValue(+0.0);
}]>;

def PSxLDQ_imm : SDNodeXForm<imm, [{
  // Transformation function: imm >> 3
  return getI32Imm(N->getValue() >> 3);
}]>;
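
// A note on the xform above: PSLLDQ/PSRLDQ shift by bytes while the DAG
// expresses the amount in bits, so 'imm >> 3' converts one to the other;
// e.g. a 64-bit shift count becomes the byte count 8.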

// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
// SHUFP* etc. imm.
def SHUFFLE_get_shuf_imm : SDNodeXForm<build_vector, [{
  return getI8Imm(X86::getShuffleSHUFImmediate(N));
}]>;

// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
// PSHUFHW imm.
def SHUFFLE_get_pshufhw_imm : SDNodeXForm<build_vector, [{
  return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
}]>;

// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
// PSHUFLW imm.
def SHUFFLE_get_pshuflw_imm : SDNodeXForm<build_vector, [{
  return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
}]>;

def SSE_splat_mask : PatLeaf<(build_vector), [{
  return X86::isSplatMask(N);
}], SHUFFLE_get_shuf_imm>;

def SSE_splat_lo_mask : PatLeaf<(build_vector), [{
  return X86::isSplatLoMask(N);
}]>;

def MOVHLPS_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isMOVHLPSMask(N);
}]>;

def MOVHLPS_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isMOVHLPS_v_undef_Mask(N);
}]>;

def MOVHP_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isMOVHPMask(N);
}]>;

def MOVLP_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isMOVLPMask(N);
}]>;

def MOVL_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isMOVLMask(N);
}]>;

def MOVSHDUP_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isMOVSHDUPMask(N);
}]>;

def MOVSLDUP_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isMOVSLDUPMask(N);
}]>;

def UNPCKL_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isUNPCKLMask(N);
}]>;

def UNPCKH_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isUNPCKHMask(N);
}]>;

def UNPCKL_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isUNPCKL_v_undef_Mask(N);
}]>;

def UNPCKH_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isUNPCKH_v_undef_Mask(N);
}]>;

def PSHUFD_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isPSHUFDMask(N);
}], SHUFFLE_get_shuf_imm>;

def PSHUFHW_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isPSHUFHWMask(N);
}], SHUFFLE_get_pshufhw_imm>;

def PSHUFLW_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isPSHUFLWMask(N);
}], SHUFFLE_get_pshuflw_imm>;

def SHUFP_unary_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isPSHUFDMask(N);
}], SHUFFLE_get_shuf_imm>;

def SHUFP_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isSHUFPMask(N);
}], SHUFFLE_get_shuf_imm>;

def PSHUFD_binary_shuffle_mask : PatLeaf<(build_vector), [{
  return X86::isSHUFPMask(N);
}], SHUFFLE_get_shuf_imm>;
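
// How these mask leaves work, in brief: a PatLeaf both tests the shuffle's
// build_vector mask and, when it carries an SDNodeXForm and is bound to a
// name, rewrites it into the instruction immediate. A minimal sketch,
// mirroring the SHUFPSrri pattern later in this file:
//
//   (vector_shuffle VR128:$src1, VR128:$src2, SHUFP_shuffle_mask:$src3)
//
// matches only when X86::isSHUFPMask(N) holds, and SHUFFLE_get_shuf_imm then
// converts $src3 into the 8-bit SHUFPS immediate.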

//===----------------------------------------------------------------------===//
// SSE scalar FP Instructions
//===----------------------------------------------------------------------===//

// CMOV* - Used to implement the SSE SELECT DAG operation. These are expanded
// by the scheduler into a branch sequence.
let Uses = [EFLAGS], usesCustomDAGSchedInserter = 1 in {
  def CMOV_FR32 : I<0, Pseudo,
                    (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
                    "#CMOV_FR32 PSEUDO!",
                    [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
                                              EFLAGS))]>;
  def CMOV_FR64 : I<0, Pseudo,
                    (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
                    "#CMOV_FR64 PSEUDO!",
                    [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
                                              EFLAGS))]>;
  def CMOV_V4F32 : I<0, Pseudo,
                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
                    "#CMOV_V4F32 PSEUDO!",
                    [(set VR128:$dst,
                      (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
                                      EFLAGS)))]>;
  def CMOV_V2F64 : I<0, Pseudo,
                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
                    "#CMOV_V2F64 PSEUDO!",
                    [(set VR128:$dst,
                      (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
                                      EFLAGS)))]>;
  def CMOV_V2I64 : I<0, Pseudo,
                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
                    "#CMOV_V2I64 PSEUDO!",
                    [(set VR128:$dst,
                      (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
                                      EFLAGS)))]>;
}
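
// A sketch of what the custom inserter does with these pseudos (the exact
// lowering lives in C++, not in this file): each CMOV_* is expanded into a
// branch diamond on the EFLAGS condition, roughly
//
//   jcc $cond  -> block that uses $t
//   fallthrough -> block that uses $f
//
// with the two values joined by a PHI in the destination register.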

//===----------------------------------------------------------------------===//
// SSE1 Instructions
//===----------------------------------------------------------------------===//

// Move Instructions
let neverHasSideEffects = 1 in
def MOVSSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                  "movss\t{$src, $dst|$dst, $src}", []>;
let isSimpleLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
def MOVSSrm : SSI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                  "movss\t{$src, $dst|$dst, $src}",
                  [(set FR32:$dst, (loadf32 addr:$src))]>;
def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
                  "movss\t{$src, $dst|$dst, $src}",
                  [(store FR32:$src, addr:$dst)]>;

// Conversion instructions
def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (fp_to_sint FR32:$src))]>;
def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
def CVTSI2SSrr  : SSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                      "cvtsi2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (sint_to_fp GR32:$src))]>;
def CVTSI2SSrm  : SSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                      "cvtsi2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>;

// Match intrinsics which expect XMM operand(s).
def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
                         "cvtss2si\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>;
def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
                         "cvtss2si\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (int_x86_sse_cvtss2si
                                           (load addr:$src)))]>;

// Match intrinsics which expect MM and XMM operand(s).
def Int_CVTPS2PIrr : PSI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
                         "cvtps2pi\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, (int_x86_sse_cvtps2pi VR128:$src))]>;
def Int_CVTPS2PIrm : PSI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
                         "cvtps2pi\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, (int_x86_sse_cvtps2pi
                                           (load addr:$src)))]>;
def Int_CVTTPS2PIrr: PSI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
                         "cvttps2pi\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, (int_x86_sse_cvttps2pi VR128:$src))]>;
def Int_CVTTPS2PIrm: PSI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
                         "cvttps2pi\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, (int_x86_sse_cvttps2pi
                                           (load addr:$src)))]>;
let Constraints = "$src1 = $dst" in {
  def Int_CVTPI2PSrr : PSI<0x2A, MRMSrcReg,
                           (outs VR128:$dst), (ins VR128:$src1, VR64:$src2),
                           "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
                           [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
                                              VR64:$src2))]>;
  def Int_CVTPI2PSrm : PSI<0x2A, MRMSrcMem,
                           (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
                           "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
                           [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
                                              (load addr:$src2)))]>;
}

// Aliases for intrinsics
def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
                          "cvttss2si\t{$src, $dst|$dst, $src}",
                          [(set GR32:$dst,
                            (int_x86_sse_cvttss2si VR128:$src))]>;
def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
                          "cvttss2si\t{$src, $dst|$dst, $src}",
                          [(set GR32:$dst,
                            (int_x86_sse_cvttss2si (load addr:$src)))]>;

let Constraints = "$src1 = $dst" in {
  def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg,
                           (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
                           "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
                           [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
                                              GR32:$src2))]>;
  def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem,
                           (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
                           "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
                           [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
                                              (loadi32 addr:$src2)))]>;
}

// Comparison instructions
let Constraints = "$src1 = $dst" in {
  let neverHasSideEffects = 1 in
  def CMPSSrr : SSIi8<0xC2, MRMSrcReg,
                      (outs FR32:$dst), (ins FR32:$src1, FR32:$src, SSECC:$cc),
                      "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
  let neverHasSideEffects = 1, mayLoad = 1 in
  def CMPSSrm : SSIi8<0xC2, MRMSrcMem,
                      (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, SSECC:$cc),
                      "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
}

let Defs = [EFLAGS] in {
def UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins FR32:$src1, FR32:$src2),
                   "ucomiss\t{$src2, $src1|$src1, $src2}",
                   [(X86cmp FR32:$src1, FR32:$src2), (implicit EFLAGS)]>;
def UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs), (ins FR32:$src1, f32mem:$src2),
                   "ucomiss\t{$src2, $src1|$src1, $src2}",
                   [(X86cmp FR32:$src1, (loadf32 addr:$src2)),
                    (implicit EFLAGS)]>;
} // Defs = [EFLAGS]

// Aliases to match intrinsics which expect XMM operand(s).
let Constraints = "$src1 = $dst" in {
  def Int_CMPSSrr : SSIi8<0xC2, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
                      "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
                                         VR128:$src, imm:$cc))]>;
  def Int_CMPSSrm : SSIi8<0xC2, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, f32mem:$src, SSECC:$cc),
                      "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
                                         (load addr:$src), imm:$cc))]>;
}

let Defs = [EFLAGS] in {
def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs),
                       (ins VR128:$src1, VR128:$src2),
                       "ucomiss\t{$src2, $src1|$src1, $src2}",
                       [(X86ucomi (v4f32 VR128:$src1), VR128:$src2),
                        (implicit EFLAGS)]>;
def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs),
                       (ins VR128:$src1, f128mem:$src2),
                       "ucomiss\t{$src2, $src1|$src1, $src2}",
                       [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2)),
                        (implicit EFLAGS)]>;

def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (outs),
                      (ins VR128:$src1, VR128:$src2),
                      "comiss\t{$src2, $src1|$src1, $src2}",
                      [(X86comi (v4f32 VR128:$src1), VR128:$src2),
                       (implicit EFLAGS)]>;
def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (outs),
                      (ins VR128:$src1, f128mem:$src2),
                      "comiss\t{$src2, $src1|$src1, $src2}",
                      [(X86comi (v4f32 VR128:$src1), (load addr:$src2)),
                       (implicit EFLAGS)]>;
} // Defs = [EFLAGS]

// Aliases of packed SSE1 instructions for scalar use. These all have names
// that start with 'Fs'.

// Alias instructions that map fld0 to pxor for sse.
let isReMaterializable = 1 in
def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins),
                 "pxor\t$dst, $dst", [(set FR32:$dst, fp32imm0)]>,
               Requires<[HasSSE1]>, TB, OpSize;

// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
// disregarded.
let neverHasSideEffects = 1 in
def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                     "movaps\t{$src, $dst|$dst, $src}", []>;

// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
// disregarded.
let isSimpleLoad = 1 in
def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;

// Alias bitwise logical operations using SSE logical ops on packed FP values.
let Constraints = "$src1 = $dst" in {
let isCommutable = 1 in {
  def FsANDPSrr : PSI<0x54, MRMSrcReg, (outs FR32:$dst),
                      (ins FR32:$src1, FR32:$src2),
                      "andps\t{$src2, $dst|$dst, $src2}",
                      [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>;
  def FsORPSrr  : PSI<0x56, MRMSrcReg, (outs FR32:$dst),
                      (ins FR32:$src1, FR32:$src2),
                      "orps\t{$src2, $dst|$dst, $src2}",
                      [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>;
  def FsXORPSrr : PSI<0x57, MRMSrcReg, (outs FR32:$dst),
                      (ins FR32:$src1, FR32:$src2),
                      "xorps\t{$src2, $dst|$dst, $src2}",
                      [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>;
}

  def FsANDPSrm : PSI<0x54, MRMSrcMem, (outs FR32:$dst),
                      (ins FR32:$src1, f128mem:$src2),
                      "andps\t{$src2, $dst|$dst, $src2}",
                      [(set FR32:$dst, (X86fand FR32:$src1,
                                        (memopfsf32 addr:$src2)))]>;
  def FsORPSrm  : PSI<0x56, MRMSrcMem, (outs FR32:$dst),
                      (ins FR32:$src1, f128mem:$src2),
                      "orps\t{$src2, $dst|$dst, $src2}",
                      [(set FR32:$dst, (X86for FR32:$src1,
                                        (memopfsf32 addr:$src2)))]>;
  def FsXORPSrm : PSI<0x57, MRMSrcMem, (outs FR32:$dst),
                      (ins FR32:$src1, f128mem:$src2),
                      "xorps\t{$src2, $dst|$dst, $src2}",
                      [(set FR32:$dst, (X86fxor FR32:$src1,
                                        (memopfsf32 addr:$src2)))]>;

  let neverHasSideEffects = 1 in {
    def FsANDNPSrr : PSI<0x55, MRMSrcReg,
                         (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
                         "andnps\t{$src2, $dst|$dst, $src2}", []>;
    def FsANDNPSrm : PSI<0x55, MRMSrcMem,
                         (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2),
                         "andnps\t{$src2, $dst|$dst, $src2}", []>;
  }
}

/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements undefined.
///
/// These three forms can each be reg+reg or reg+mem, so there are a total of
/// six "instructions".
///
let Constraints = "$src1 = $dst" in {
multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, Intrinsic F32Int,
                                  bit Commutable = 0> {
  // Scalar operation, reg+reg.
  def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
                 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
    let isCommutable = Commutable;
  }

  // Scalar operation, reg+mem.
  def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2),
                 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                 [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;

  // Vector operation, reg+reg.
  def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
    let isCommutable = Commutable;
  }

  // Vector operation, reg+mem.
  def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;

  // Intrinsic operation, reg+reg.
  def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
    let isCommutable = Commutable;
  }

  // Intrinsic operation, reg+mem.
  def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                     !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (F32Int VR128:$src1,
                                        sse_load_f32:$src2))]>;
}
}

// Arithmetic instructions
defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
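
// Note on naming: TableGen concatenates the defm name with each def name in
// the multiclass, so 'defm ADD' above produces the six instructions ADDSSrr,
// ADDSSrm, ADDPSrr, ADDPSrm, ADDSSrr_Int and ADDSSrm_Int.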

/// sse1_fp_binop_rm - Other SSE1 binops
///
/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of
/// instructions for a full-vector intrinsic form. Operations that map
/// onto C operators don't use this form since they just use the plain
/// vector form instead of having a separate vector intrinsic form.
///
/// This provides a total of eight "instructions".
///
let Constraints = "$src1 = $dst" in {
multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
                            SDNode OpNode,
                            Intrinsic F32Int,
                            Intrinsic V4F32Int,
                            bit Commutable = 0> {
  // Scalar operation, reg+reg.
  def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
                 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
    let isCommutable = Commutable;
  }

  // Scalar operation, reg+mem.
  def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2),
                 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                 [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;

  // Vector operation, reg+reg.
  def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
    let isCommutable = Commutable;
  }

  // Vector operation, reg+mem.
  def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;

  // Intrinsic operation, reg+reg.
  def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
    let isCommutable = Commutable;
  }

  // Intrinsic operation, reg+mem.
  def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                     !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (F32Int VR128:$src1,
                                        sse_load_f32:$src2))]>;

  // Vector intrinsic operation, reg+reg.
  def PSrr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> {
    let isCommutable = Commutable;
  }

  // Vector intrinsic operation, reg+mem.
  def PSrm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                     !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (V4F32Int VR128:$src1, (load addr:$src2)))]>;
}
}

defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax,
                            int_x86_sse_max_ss, int_x86_sse_max_ps>;
defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
                            int_x86_sse_min_ss, int_x86_sse_min_ps>;
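
// Likewise, 'defm MAX' and 'defm MIN' each produce the eight instructions
// promised above: MAXSSrr, MAXSSrm, MAXPSrr, MAXPSrm and the corresponding
// _Int forms MAXSSrr_Int, MAXSSrm_Int, MAXPSrr_Int and MAXPSrm_Int.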

//===----------------------------------------------------------------------===//
// SSE packed FP Instructions

// Move Instructions
let neverHasSideEffects = 1 in
def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}", []>;
let isSimpleLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
def MOVAPSrm : PSI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (alignedloadv4f32 addr:$src))]>;

def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;

let neverHasSideEffects = 1 in
def MOVUPSrr : PSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}", []>;
let isSimpleLoad = 1 in
def MOVUPSrm : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (loadv4f32 addr:$src))]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;

// Intrinsic forms of MOVUPS load and store
let isSimpleLoad = 1 in
def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "movups\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                       "movups\t{$src, $dst|$dst, $src}",
                       [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;

let Constraints = "$src1 = $dst" in {
  let AddedComplexity = 20 in {
    def MOVLPSrm : PSI<0x12, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                       "movlps\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (vector_shuffle VR128:$src1,
                                 (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
                                 MOVLP_shuffle_mask)))]>;
    def MOVHPSrm : PSI<0x16, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                       "movhps\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (vector_shuffle VR128:$src1,
                                 (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
                                 MOVHP_shuffle_mask)))]>;
  } // AddedComplexity
} // Constraints = "$src1 = $dst"

def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;

// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (vector_shuffle
                                         (bc_v2f64 (v4f32 VR128:$src)), (undef),
                                         UNPCKH_shuffle_mask)), (iPTR 0))),
                           addr:$dst)]>;

let Constraints = "$src1 = $dst" in {
  let AddedComplexity = 15 in {
    def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
                                  MOVHP_shuffle_mask)))]>;

    def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
                                  MOVHLPS_shuffle_mask)))]>;
  } // AddedComplexity
} // Constraints = "$src1 = $dst"

/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And, we have a special variant form for a full-vector intrinsic form.
///
/// These four forms can each have a reg or a mem operand, so there are a
/// total of eight "instructions".
///
multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
                           SDNode OpNode,
                           Intrinsic F32Int,
                           Intrinsic V4F32Int,
                           bit Commutable = 0> {
  // Scalar operation, reg.
  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode FR32:$src))]> {
    let isCommutable = Commutable;
  }

  // Scalar operation, mem.
  def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode (load addr:$src)))]>;

  // Vector operation, reg.
  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> {
    let isCommutable = Commutable;
  }

  // Vector operation, mem.
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>;

  // Intrinsic operation, reg.
  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F32Int VR128:$src))]> {
    let isCommutable = Commutable;
  }

  // Intrinsic operation, mem.
  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;

  // Vector intrinsic operation, reg
  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int VR128:$src))]> {
    let isCommutable = Commutable;
  }

  // Vector intrinsic operation, mem
  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int (load addr:$src)))]>;
}

defm SQRT : sse1_fp_unop_rm<0x51, "sqrt", fsqrt,
                            int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
                             int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
defm RCP : sse1_fp_unop_rm<0x53, "rcp", X86frcp,
                           int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;
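
// For reference, one Newton-Raphson step refines the RCP estimate x0 of 1/a
// as
//   x1 = x0 * (2.0 - a * x0)
// and the RSQRT estimate x0 of 1/sqrt(a) as
//   x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
// roughly doubling the ~12 bits of precision the hardware approximation
// provides.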

let Constraints = "$src1 = $dst" in {
  let isCommutable = 1 in {
    def ANDPSrr : PSI<0x54, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "andps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (v2i64
                                         (and VR128:$src1, VR128:$src2)))]>;
    def ORPSrr  : PSI<0x56, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "orps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (v2i64
                                         (or VR128:$src1, VR128:$src2)))]>;
    def XORPSrr : PSI<0x57, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "xorps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (v2i64
                                         (xor VR128:$src1, VR128:$src2)))]>;
  }

  def ANDPSrm : PSI<0x54, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                    "andps\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst, (and (bc_v2i64 (v4f32 VR128:$src1)),
                                           (memopv2i64 addr:$src2)))]>;
  def ORPSrm  : PSI<0x56, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                    "orps\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst, (or (bc_v2i64 (v4f32 VR128:$src1)),
                                          (memopv2i64 addr:$src2)))]>;
  def XORPSrm : PSI<0x57, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                    "xorps\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst, (xor (bc_v2i64 (v4f32 VR128:$src1)),
                                           (memopv2i64 addr:$src2)))]>;
  def ANDNPSrr : PSI<0x55, MRMSrcReg,
                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     "andnps\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
                       (v2i64 (and (xor VR128:$src1,
                                        (bc_v2i64 (v4i32 immAllOnesV))),
                                   VR128:$src2)))]>;
  def ANDNPSrm : PSI<0x55, MRMSrcMem,
                     (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                     "andnps\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
                       (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)),
                                        (bc_v2i64 (v4i32 immAllOnesV))),
                                   (memopv2i64 addr:$src2))))]>;
}

let Constraints = "$src1 = $dst" in {
  def CMPPSrri : PSIi8<0xC2, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
                       "cmp${cc}ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
                                          VR128:$src, imm:$cc))]>;
  def CMPPSrmi : PSIi8<0xC2, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
                       "cmp${cc}ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
                                          (load addr:$src), imm:$cc))]>;
}

// Shuffle and unpack instructions
let Constraints = "$src1 = $dst" in {
  let isConvertibleToThreeAddress = 1 in // Convert to pshufd
    def SHUFPSrri : PSIi8<0xC6, MRMSrcReg,
                          (outs VR128:$dst), (ins VR128:$src1,
                           VR128:$src2, i32i8imm:$src3),
                          "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                            (v4f32 (vector_shuffle
                                    VR128:$src1, VR128:$src2,
                                    SHUFP_shuffle_mask:$src3)))]>;
  def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1,
                         f128mem:$src2, i32i8imm:$src3),
                        "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                        [(set VR128:$dst,
                          (v4f32 (vector_shuffle
                                  VR128:$src1, (memopv4f32 addr:$src2),
                                  SHUFP_shuffle_mask:$src3)))]>;

  let AddedComplexity = 10 in {
    def UNPCKHPSrr : PSI<0x15, MRMSrcReg,
                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                         "unpckhps\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v4f32 (vector_shuffle
                                   VR128:$src1, VR128:$src2,
                                   UNPCKH_shuffle_mask)))]>;
    def UNPCKHPSrm : PSI<0x15, MRMSrcMem,
                         (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                         "unpckhps\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v4f32 (vector_shuffle
                                   VR128:$src1, (memopv4f32 addr:$src2),
                                   UNPCKH_shuffle_mask)))]>;
    def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                         "unpcklps\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v4f32 (vector_shuffle
                                   VR128:$src1, VR128:$src2,
                                   UNPCKL_shuffle_mask)))]>;
    def UNPCKLPSrm : PSI<0x14, MRMSrcMem,
                         (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                         "unpcklps\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v4f32 (vector_shuffle
                                   VR128:$src1, (memopv4f32 addr:$src2),
                                   UNPCKL_shuffle_mask)))]>;
  } // AddedComplexity
} // Constraints = "$src1 = $dst"

// Mask creation
def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
                     "movmskps\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>;
def MOVMSKPDrr : PSI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
                     "movmskpd\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>;

// Prefetch intrinsic.
def PREFETCHT0  : PSI<0x18, MRM1m, (outs), (ins i8mem:$src),
                      "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>;
def PREFETCHT1  : PSI<0x18, MRM2m, (outs), (ins i8mem:$src),
                      "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>;
def PREFETCHT2  : PSI<0x18, MRM3m, (outs), (ins i8mem:$src),
                      "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>;
def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
                      "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>;

// Non-temporal stores
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;

// Load, store, and memory fence
def SFENCE : PSI<0xAE, MRM7m, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>;

// MXCSR register
def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>;
def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;

// Alias instructions that map zero vector to pxor / xorp* for sse.
let isReMaterializable = 1 in
def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins),
                 "xorps\t$dst, $dst",
                 [(set VR128:$dst, (v4i32 immAllZerosV))]>;

let Predicates = [HasSSE1] in {
  def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
  def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
  def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
  def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
  def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
}

// FR32 to 128-bit vector conversion.
def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR32:$src),
                     "movss\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4f32 (scalar_to_vector FR32:$src)))]>;
def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
                     "movss\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;

// FIXME: may not be able to eliminate this movss with coalescing, the src and
// dest register classes are different. We really want to write this pattern
// like this:
// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
//           (f32 FR32:$src)>;
def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins VR128:$src),
                     "movss\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (vector_extract (v4f32 VR128:$src),
                                       (iPTR 0)))]>;
def MOVPS2SSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                     "movss\t{$src, $dst|$dst, $src}",
                     [(store (f32 (vector_extract (v4f32 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>;

// Move to lower bits of a VR128, leaving upper bits alone.
// Three operand (but two address) aliases.
let Constraints = "$src1 = $dst" in {
  let neverHasSideEffects = 1 in
  def MOVLSS2PSrr : SSI<0x10, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, FR32:$src2),
                        "movss\t{$src2, $dst|$dst, $src2}", []>;

  let AddedComplexity = 15 in
  def MOVLPSrr : SSI<0x10, MRMSrcReg,
                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     "movss\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
                       (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
                               MOVL_shuffle_mask)))]>;
}

// Move to lower bits of a VR128 and zeroing upper bits.
// Loading from memory automatically zeroes the upper bits.
let AddedComplexity = 20 in
def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
                      "movss\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (v4f32 (X86vzmovl (v4f32 (scalar_to_vector
                                                 (loadf32 addr:$src))))))]>;

def : Pat<(v4f32 (X86vzmovl (memopv4f32 addr:$src))),
          (MOVZSS2PSrm addr:$src)>;
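
// The Pat above works because the memory form of movss already zero-extends
// the loaded scalar into the full register, so a whole-vector load consumed
// only through X86vzmovl (keep element 0, zero the rest) can be shrunk to
// the same instruction:
//
//   (v4f32 (X86vzmovl (memopv4f32 addr)))  ->  (MOVZSS2PSrm addr)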

//===----------------------------------------------------------------------===//
// SSE2 Instructions
//===----------------------------------------------------------------------===//

// Move Instructions
let neverHasSideEffects = 1 in
def MOVSDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                  "movsd\t{$src, $dst|$dst, $src}", []>;
let isSimpleLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
def MOVSDrm : SDI<0x10, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
                  "movsd\t{$src, $dst|$dst, $src}",
                  [(set FR64:$dst, (loadf64 addr:$src))]>;
def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
                  "movsd\t{$src, $dst|$dst, $src}",
                  [(store FR64:$src, addr:$dst)]>;

// Conversion instructions
def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (fp_to_sint FR64:$src))]>;
def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src),
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))]>;
def CVTSD2SSrm  : SDI<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround (loadf64 addr:$src)))]>;
def CVTSI2SDrr  : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src),
                      "cvtsi2sd\t{$src, $dst|$dst, $src}",
                      [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
def CVTSI2SDrm  : SDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i32mem:$src),
                      "cvtsi2sd\t{$src, $dst|$dst, $src}",
                      [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;

// SSE2 instructions with XS prefix
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fextend FR32:$src))]>, XS,
                 Requires<[HasSSE2]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
                 Requires<[HasSSE2]>;

// Match intrinsics which expect XMM operand(s).
def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
                         "cvtsd2si\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>;
def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
                         "cvtsd2si\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (int_x86_sse2_cvtsd2si
                                           (load addr:$src)))]>;

// Match intrinsics which expect MM and XMM operand(s).
def Int_CVTPD2PIrr : PDI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
                         "cvtpd2pi\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, (int_x86_sse_cvtpd2pi VR128:$src))]>;
def Int_CVTPD2PIrm : PDI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
                         "cvtpd2pi\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, (int_x86_sse_cvtpd2pi
                                           (load addr:$src)))]>;
def Int_CVTTPD2PIrr: PDI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
                         "cvttpd2pi\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, (int_x86_sse_cvttpd2pi VR128:$src))]>;
def Int_CVTTPD2PIrm: PDI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
                         "cvttpd2pi\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, (int_x86_sse_cvttpd2pi
                                           (load addr:$src)))]>;
def Int_CVTPI2PDrr : PDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
                         "cvtpi2pd\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse_cvtpi2pd VR64:$src))]>;
def Int_CVTPI2PDrm : PDI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                         "cvtpi2pd\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse_cvtpi2pd
                                            (load addr:$src)))]>;

// Aliases for intrinsics
def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
                          "cvttsd2si\t{$src, $dst|$dst, $src}",
                          [(set GR32:$dst,
                            (int_x86_sse2_cvttsd2si VR128:$src))]>;
def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
                          "cvttsd2si\t{$src, $dst|$dst, $src}",
                          [(set GR32:$dst, (int_x86_sse2_cvttsd2si
                                            (load addr:$src)))]>;

// Comparison instructions
let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
  def CMPSDrr : SDIi8<0xC2, MRMSrcReg,
                      (outs FR64:$dst), (ins FR64:$src1, FR64:$src, SSECC:$cc),
                      "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
  let mayLoad = 1 in
  def CMPSDrm : SDIi8<0xC2, MRMSrcMem,
                      (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, SSECC:$cc),
                      "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
}

let Defs = [EFLAGS] in {
def UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins FR64:$src1, FR64:$src2),
                   "ucomisd\t{$src2, $src1|$src1, $src2}",
                   [(X86cmp FR64:$src1, FR64:$src2), (implicit EFLAGS)]>;
def UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs), (ins FR64:$src1, f64mem:$src2),
                   "ucomisd\t{$src2, $src1|$src1, $src2}",
                   [(X86cmp FR64:$src1, (loadf64 addr:$src2)),
                    (implicit EFLAGS)]>;
} // Defs = [EFLAGS]

// Aliases to match intrinsics which expect XMM operand(s).
let Constraints = "$src1 = $dst" in {
  def Int_CMPSDrr : SDIi8<0xC2, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
                      "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
                                         VR128:$src, imm:$cc))]>;
  def Int_CMPSDrm : SDIi8<0xC2, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, f64mem:$src, SSECC:$cc),
                      "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
                                         (load addr:$src), imm:$cc))]>;
}

let Defs = [EFLAGS] in {
def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                       "ucomisd\t{$src2, $src1|$src1, $src2}",
                       [(X86ucomi (v2f64 VR128:$src1), (v2f64 VR128:$src2)),
                        (implicit EFLAGS)]>;
def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                       "ucomisd\t{$src2, $src1|$src1, $src2}",
                       [(X86ucomi (v2f64 VR128:$src1), (load addr:$src2)),
                        (implicit EFLAGS)]>;

def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                      "comisd\t{$src2, $src1|$src1, $src2}",
                      [(X86comi (v2f64 VR128:$src1), (v2f64 VR128:$src2)),
                       (implicit EFLAGS)]>;
def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                      "comisd\t{$src2, $src1|$src1, $src2}",
                      [(X86comi (v2f64 VR128:$src1), (load addr:$src2)),
                       (implicit EFLAGS)]>;
} // Defs = [EFLAGS]

// Aliases of packed SSE2 instructions for scalar use. These all have names
// that start with 'Fs'.

// Alias instructions that map fld0 to pxor for sse.
let isReMaterializable = 1 in
def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins),
                 "pxor\t$dst, $dst", [(set FR64:$dst, fpimm0)]>,
               Requires<[HasSSE2]>, TB, OpSize;

// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are
// disregarded.
let neverHasSideEffects = 1 in
def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                     "movapd\t{$src, $dst|$dst, $src}", []>;

// Alias instruction to load FR64 from f128mem using movapd. Upper bits are
// disregarded.
let isSimpleLoad = 1 in
def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;

// Alias bitwise logical operations using SSE logical ops on packed FP values.
let Constraints = "$src1 = $dst" in {
let isCommutable = 1 in {
  def FsANDPDrr : PDI<0x54, MRMSrcReg, (outs FR64:$dst),
                      (ins FR64:$src1, FR64:$src2),
                      "andpd\t{$src2, $dst|$dst, $src2}",
                      [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>;
  def FsORPDrr  : PDI<0x56, MRMSrcReg, (outs FR64:$dst),
                      (ins FR64:$src1, FR64:$src2),
                      "orpd\t{$src2, $dst|$dst, $src2}",
                      [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>;
  def FsXORPDrr : PDI<0x57, MRMSrcReg, (outs FR64:$dst),
                      (ins FR64:$src1, FR64:$src2),
                      "xorpd\t{$src2, $dst|$dst, $src2}",
                      [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>;
}

  def FsANDPDrm : PDI<0x54, MRMSrcMem, (outs FR64:$dst),
                      (ins FR64:$src1, f128mem:$src2),
                      "andpd\t{$src2, $dst|$dst, $src2}",
                      [(set FR64:$dst, (X86fand FR64:$src1,
                                        (memopfsf64 addr:$src2)))]>;
  def FsORPDrm  : PDI<0x56, MRMSrcMem, (outs FR64:$dst),
                      (ins FR64:$src1, f128mem:$src2),
                      "orpd\t{$src2, $dst|$dst, $src2}",
                      [(set FR64:$dst, (X86for FR64:$src1,
                                        (memopfsf64 addr:$src2)))]>;
  def FsXORPDrm : PDI<0x57, MRMSrcMem, (outs FR64:$dst),
                      (ins FR64:$src1, f128mem:$src2),
                      "xorpd\t{$src2, $dst|$dst, $src2}",
                      [(set FR64:$dst, (X86fxor FR64:$src1,
                                        (memopfsf64 addr:$src2)))]>;

  let neverHasSideEffects = 1 in {
    def FsANDNPDrr : PDI<0x55, MRMSrcReg,
                         (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
                         "andnpd\t{$src2, $dst|$dst, $src2}", []>;
    def FsANDNPDrm : PDI<0x55, MRMSrcMem,
                         (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
                         "andnpd\t{$src2, $dst|$dst, $src2}", []>;
  }
}

/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements undefined.
///
/// These three forms can each be reg+reg or reg+mem, so there are a total of
/// six "instructions".
///
let Constraints = "$src1 = $dst" in {
multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, Intrinsic F64Int,
                                  bit Commutable = 0> {
  // Scalar operation, reg+reg.
  def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
                 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
    let isCommutable = Commutable;
  }

  // Scalar operation, reg+mem.
  def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2),
                 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                 [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;

  // Vector operation, reg+reg.
  def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
    let isCommutable = Commutable;
  }

  // Vector operation, reg+mem.
  def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;

  // Intrinsic operation, reg+reg.
  def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
    let isCommutable = Commutable;
  }

  // Intrinsic operation, reg+mem.
  def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                     !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (F64Int VR128:$src1,
                                        sse_load_f64:$src2))]>;
}
}

// Arithmetic instructions
defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>;
defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>;
defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>;
defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>;

/// sse2_fp_binop_rm - Other SSE2 binops
///
/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of
/// instructions for a full-vector intrinsic form. Operations that map
/// onto C operators don't use this form since they just use the plain
/// vector form instead of having a separate vector intrinsic form.
///
/// This provides a total of eight "instructions".
///
let Constraints = "$src1 = $dst" in {
multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
                            SDNode OpNode,
                            Intrinsic F64Int,
                            Intrinsic V2F64Int,
                            bit Commutable = 0> {
  // Scalar operation, reg+reg.
  def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
                 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
    let isCommutable = Commutable;
  }

  // Scalar operation, reg+mem.
  def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2),
                 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                 [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;

  // Vector operation, reg+reg.
  def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
    let isCommutable = Commutable;
  }

  // Vector operation, reg+mem.
  def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;

  // Intrinsic operation, reg+reg.
  def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
    let isCommutable = Commutable;
  }

  // Intrinsic operation, reg+mem.
  def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                     !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (F64Int VR128:$src1,
                                        sse_load_f64:$src2))]>;

  // Vector intrinsic operation, reg+reg.
  def PDrr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> {
    let isCommutable = Commutable;
  }

  // Vector intrinsic operation, reg+mem.
  def PDrm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                     !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (V2F64Int VR128:$src1, (load addr:$src2)))]>;
}
}

defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax,
                            int_x86_sse2_max_sd, int_x86_sse2_max_pd>;
defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
                            int_x86_sse2_min_sd, int_x86_sse2_min_pd>;

//===----------------------------------------------------------------------===//
// SSE packed FP Instructions

// Move Instructions
let neverHasSideEffects = 1 in
def MOVAPDrr : PDI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}", []>;
let isSimpleLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
def MOVAPDrm : PDI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (alignedloadv2f64 addr:$src))]>;

def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;

let neverHasSideEffects = 1 in
def MOVUPDrr : PDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}", []>;
let isSimpleLoad = 1 in
def MOVUPDrm : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (loadv2f64 addr:$src))]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;

// Intrinsic forms of MOVUPD load and store
def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "movupd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>;
def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                       "movupd\t{$src, $dst|$dst, $src}",
                       [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;

let Constraints = "$src1 = $dst" in {
  let AddedComplexity = 20 in {
    def MOVLPDrm : PDI<0x12, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                       "movlpd\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v2f64 (vector_shuffle VR128:$src1,
                                 (scalar_to_vector (loadf64 addr:$src2)),
                                 MOVLP_shuffle_mask)))]>;
    def MOVHPDrm : PDI<0x16, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                       "movhpd\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v2f64 (vector_shuffle VR128:$src1,
                                 (scalar_to_vector (loadf64 addr:$src2)),
                                 MOVHP_shuffle_mask)))]>;
  } // AddedComplexity
} // Constraints = "$src1 = $dst"
1415 def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
1416 "movlpd\t{$src, $dst|$dst, $src}",
1417 [(store (f64 (vector_extract (v2f64 VR128:$src),
1418 (iPTR 0))), addr:$dst)]>;
1420 // v2f64 extract element 1 is always custom lowered to unpack high to low
1421 // and extract element 0 so the non-store version isn't too horrible.
1422 def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
1423 "movhpd\t{$src, $dst|$dst, $src}",
1424 [(store (f64 (vector_extract
1425 (v2f64 (vector_shuffle VR128:$src, (undef),
1426 UNPCKH_shuffle_mask)), (iPTR 0))),
// SSE2 instructions without OpSize prefix
def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
                     TB, Requires<[HasSSE2]>;
def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "cvtdq2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
                                          (bitconvert (memopv2i64 addr:$src))))]>,
                     TB, Requires<[HasSSE2]>;

// SSE2 instructions with XS prefix
def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
                     XS, Requires<[HasSSE2]>;
def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
                                          (bitconvert (memopv2i64 addr:$src))))]>,
                     XS, Requires<[HasSSE2]>;

def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq
                                            (load addr:$src)))]>;
// SSE2 packed instructions with XS prefix
def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))]>,
                      XS, Requires<[HasSSE2]>;
def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvttps2dq
                                           (load addr:$src)))]>,
                      XS, Requires<[HasSSE2]>;

// SSE2 packed instructions with XD prefix
def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
                     XD, Requires<[HasSSE2]>;
def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
                                          (load addr:$src)))]>,
                     XD, Requires<[HasSSE2]>;

def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "cvttpd2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvttpd2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                             (load addr:$src)))]>;

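// Note the difference between the cvt and cvtt forms above: the cvtt*
// variants truncate (round toward zero), while the plain cvt* variants
// round according to the current rounding mode in MXCSR.
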
// SSE2 instructions without OpSize prefix
def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
                     TB, Requires<[HasSSE2]>;
def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd
                                          (load addr:$src)))]>,
                     TB, Requires<[HasSSE2]>;

def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
                                            (load addr:$src)))]>;

// Match intrinsics which expect XMM operand(s).
// Aliases for intrinsics
let Constraints = "$src1 = $dst" in {
def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
                        "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
                                           GR32:$src2))]>;
def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
                        "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
                                           (loadi32 addr:$src2)))]>;
def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
                                           VR128:$src2))]>;
def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                        "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
                                           (load addr:$src2)))]>;
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
                                         VR128:$src2))]>, XS,
                    Requires<[HasSSE2]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
                                         (load addr:$src2)))]>, XS,
                    Requires<[HasSSE2]>;
}

/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And, we have a special variant form for a full-vector intrinsic form.
///
/// These four forms can each have a reg or a mem operand, so there are a
/// total of eight "instructions".

multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
                           SDNode OpNode,
                           Intrinsic F64Int,
                           Intrinsic V2F64Int,
                           bit Commutable = 0> {
  // Scalar operation, reg.
  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode FR64:$src))]> {
    let isCommutable = Commutable;
  }

  // Scalar operation, mem.
  def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode (load addr:$src)))]>;

  // Vector operation, reg.
  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> {
    let isCommutable = Commutable;
  }

  // Vector operation, mem.
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>;

  // Intrinsic operation, reg.
  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int VR128:$src))]> {
    let isCommutable = Commutable;
  }

  // Intrinsic operation, mem.
  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;

  // Vector intrinsic operation, reg
  def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V2F64Int VR128:$src))]> {
    let isCommutable = Commutable;
  }

  // Vector intrinsic operation, mem
  def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V2F64Int (load addr:$src)))]>;
}

// Square root.
defm SQRT : sse2_fp_unop_rm<0x51, "sqrt", fsqrt,
                            int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>;

// There is no f64 version of the reciprocal approximation instructions.
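// (The rsqrt/rcp approximations, RSQRTSS/RCPSS and their packed forms, are
// SSE1 and single-precision only; scalar f64 code must use the exact SQRTSD
// and DIVSD instead.)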

// Logical
let Constraints = "$src1 = $dst" in {
  let isCommutable = 1 in {
    def ANDPDrr : PDI<0x54, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "andpd\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (and (bc_v2i64 (v2f64 VR128:$src1)),
                             (bc_v2i64 (v2f64 VR128:$src2))))]>;
    def ORPDrr  : PDI<0x56, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "orpd\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (or (bc_v2i64 (v2f64 VR128:$src1)),
                            (bc_v2i64 (v2f64 VR128:$src2))))]>;
    def XORPDrr : PDI<0x57, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "xorpd\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (xor (bc_v2i64 (v2f64 VR128:$src1)),
                             (bc_v2i64 (v2f64 VR128:$src2))))]>;
  }

  def ANDPDrm : PDI<0x54, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                    "andpd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (and (bc_v2i64 (v2f64 VR128:$src1)),
                           (memopv2i64 addr:$src2)))]>;
  def ORPDrm  : PDI<0x56, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                    "orpd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (or (bc_v2i64 (v2f64 VR128:$src1)),
                          (memopv2i64 addr:$src2)))]>;
  def XORPDrm : PDI<0x57, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                    "xorpd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (xor (bc_v2i64 (v2f64 VR128:$src1)),
                           (memopv2i64 addr:$src2)))]>;
  def ANDNPDrr : PDI<0x55, MRMSrcReg,
                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     "andnpd\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
                       (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
                            (bc_v2i64 (v2f64 VR128:$src2))))]>;
  def ANDNPDrm : PDI<0x55, MRMSrcMem,
                     (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                     "andnpd\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
                       (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
                            (memopv2i64 addr:$src2)))]>;
}

let Constraints = "$src1 = $dst" in {
  def CMPPDrri : PDIi8<0xC2, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
                       "cmp${cc}pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
                                          VR128:$src, imm:$cc))]>;
  def CMPPDrmi : PDIi8<0xC2, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
                       "cmp${cc}pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
                                          (load addr:$src), imm:$cc))]>;
}

// Shuffle and unpack instructions
let Constraints = "$src1 = $dst" in {
  def SHUFPDrri : PDIi8<0xC6, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3),
                        "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                        [(set VR128:$dst, (v2f64 (vector_shuffle
                                                  VR128:$src1, VR128:$src2,
                                                  SHUFP_shuffle_mask:$src3)))]>;
  def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1,
                                                f128mem:$src2, i8imm:$src3),
                        "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                        [(set VR128:$dst,
                          (v2f64 (vector_shuffle
                                  VR128:$src1, (memopv2f64 addr:$src2),
                                  SHUFP_shuffle_mask:$src3)))]>;

  let AddedComplexity = 10 in {
    def UNPCKHPDrr : PDI<0x15, MRMSrcReg,
                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                         "unpckhpd\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v2f64 (vector_shuffle
                                   VR128:$src1, VR128:$src2,
                                   UNPCKH_shuffle_mask)))]>;
    def UNPCKHPDrm : PDI<0x15, MRMSrcMem,
                         (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                         "unpckhpd\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v2f64 (vector_shuffle
                                   VR128:$src1, (memopv2f64 addr:$src2),
                                   UNPCKH_shuffle_mask)))]>;
    def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                         "unpcklpd\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v2f64 (vector_shuffle
                                   VR128:$src1, VR128:$src2,
                                   UNPCKL_shuffle_mask)))]>;
    def UNPCKLPDrm : PDI<0x14, MRMSrcMem,
                         (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                         "unpcklpd\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v2f64 (vector_shuffle
                                   VR128:$src1, (memopv2f64 addr:$src2),
                                   UNPCKL_shuffle_mask)))]>;
  } // AddedComplexity
} // Constraints = "$src1 = $dst"

//===----------------------------------------------------------------------===//
// SSE integer instructions
//===----------------------------------------------------------------------===//

// Move Instructions
let neverHasSideEffects = 1 in
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", []>;
let isSimpleLoad = 1, mayLoad = 1 in
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
let mayStore = 1 in
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
let isSimpleLoad = 1, mayLoad = 1 in
def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                 "movdqu\t{$src, $dst|$dst, $src}",
                 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
               XS, Requires<[HasSSE2]>;
let mayStore = 1 in
def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                 "movdqu\t{$src, $dst|$dst, $src}",
                 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
               XS, Requires<[HasSSE2]>;

// Intrinsic forms of MOVDQU load and store
let isSimpleLoad = 1 in
def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                     "movdqu\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
                   XS, Requires<[HasSSE2]>;
def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                     "movdqu\t{$src, $dst|$dst, $src}",
                     [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
                   XS, Requires<[HasSSE2]>;

let Constraints = "$src1 = $dst" in {

multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                            bit Commutable = 0> {
  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
    let isCommutable = Commutable;
  }
  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
               [(set VR128:$dst, (IntId VR128:$src1,
                                  (bitconvert (memopv2i64 addr:$src2))))]>;
}

multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
                             string OpcodeStr,
                             Intrinsic IntId, Intrinsic IntId2> {
  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
               [(set VR128:$dst, (IntId VR128:$src1,
                                  (bitconvert (memopv2i64 addr:$src2))))]>;
  def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>;
}

/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, bit Commutable = 0> {
  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
               [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> {
    let isCommutable = Commutable;
  }
  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
               [(set VR128:$dst, (OpVT (OpNode VR128:$src1,
                                        (bitconvert (memopv2i64 addr:$src2)))))]>;
}

/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
///
/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew
/// to collapse (bitconvert VT to VT) into its operand.
///
multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
                              bit Commutable = 0> {
  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
               [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> {
    let isCommutable = Commutable;
  }
  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
               [(set VR128:$dst, (OpNode VR128:$src1, (memopv2i64 addr:$src2)))]>;
}

} // Constraints = "$src1 = $dst"

// 128-bit Integer Arithmetic

defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>;
defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>;
defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>;
defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>;

defm PADDSB  : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>;
defm PADDSW  : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>;
defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>;
defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>;

defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>;
defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>;
defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>;
defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>;

defm PSUBSB  : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>;
defm PSUBSW  : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>;
defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>;
defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>;

defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>;

defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>;
defm PMULHW  : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>;
defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>;

defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>;

defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>;
defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>;

defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>;
defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>;
defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>;
defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;
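// PSADBW computes the absolute differences of the 16 unsigned byte pairs
// and accumulates each group of eight differences into the low 16 bits of
// the corresponding 64-bit element of the result.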

defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
                               int_x86_sse2_psll_w, int_x86_sse2_pslli_w>;
defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
                               int_x86_sse2_psll_d, int_x86_sse2_pslli_d>;
defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
                               int_x86_sse2_psll_q, int_x86_sse2_pslli_q>;

defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
                               int_x86_sse2_psrl_w, int_x86_sse2_psrli_w>;
defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
                               int_x86_sse2_psrl_d, int_x86_sse2_psrli_d>;
defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
                               int_x86_sse2_psrl_q, int_x86_sse2_psrli_q>;

defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
                               int_x86_sse2_psra_w, int_x86_sse2_psrai_w>;
defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
                               int_x86_sse2_psra_d, int_x86_sse2_psrai_d>;

// 128-bit logical shifts.
let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
  def PSLLDQri : PDIi8<0x73, MRM7r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "pslldq\t{$src2, $dst|$dst, $src2}", []>;
  def PSRLDQri : PDIi8<0x73, MRM3r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "psrldq\t{$src2, $dst|$dst, $src2}", []>;
  // PSRADQri doesn't exist in SSE[1-3].
}

let Predicates = [HasSSE2] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (v2i64 (PSLLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
}

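// Note: the psll_dq/psrl_dq intrinsics take their shift amount in bits,
// while PSLLDQ/PSRLDQ shift by bytes; PSxLDQ_imm is assumed to be the
// SDNodeXForm (defined earlier in this file) that converts the bit count
// into the byte count the instruction expects.
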
// Logical
defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>;
defm POR  : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>;
defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>;

let Constraints = "$src1 = $dst" in {
  def PANDNrr : PDI<0xDF, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "pandn\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
                                                  VR128:$src2)))]>;

  def PANDNrm : PDI<0xDF, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                    "pandn\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
                                                  (memopv2i64 addr:$src2))))]>;
}

// SSE2 Integer comparison
defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>;
defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>;
defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>;
defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>;
defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;

// Pack instructions
defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;

// Shuffle and unpack instructions
def PSHUFDri : PDIi8<0x70, MRMSrcReg,
                     (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
                     "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR128:$dst, (v4i32 (vector_shuffle
                                               VR128:$src1, (undef),
                                               PSHUFD_shuffle_mask:$src2)))]>;
def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
                     (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
                     "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR128:$dst, (v4i32 (vector_shuffle
                                               (bc_v4i32 (memopv2i64 addr:$src1)),
                                               (undef),
                                               PSHUFD_shuffle_mask:$src2)))]>;

// SSE2 with ImmT == Imm8 and XS prefix.
def PSHUFHWri : Ii8<0x70, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
                    "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst, (v8i16 (vector_shuffle
                                              VR128:$src1, (undef),
                                              PSHUFHW_shuffle_mask:$src2)))]>,
                XS, Requires<[HasSSE2]>;
def PSHUFHWmi : Ii8<0x70, MRMSrcMem,
                    (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
                    "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst, (v8i16 (vector_shuffle
                                              (bc_v8i16 (memopv2i64 addr:$src1)),
                                              (undef),
                                              PSHUFHW_shuffle_mask:$src2)))]>,
                XS, Requires<[HasSSE2]>;

// SSE2 with ImmT == Imm8 and XD prefix.
def PSHUFLWri : Ii8<0x70, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst, (v8i16 (vector_shuffle
                                              VR128:$src1, (undef),
                                              PSHUFLW_shuffle_mask:$src2)))]>,
                XD, Requires<[HasSSE2]>;
def PSHUFLWmi : Ii8<0x70, MRMSrcMem,
                    (outs VR128:$dst), (ins i128mem:$src1, i32i8imm:$src2),
                    "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst, (v8i16 (vector_shuffle
                                              (bc_v8i16 (memopv2i64 addr:$src1)),
                                              (undef),
                                              PSHUFLW_shuffle_mask:$src2)))]>,
                XD, Requires<[HasSSE2]>;

let Constraints = "$src1 = $dst" in {
  def PUNPCKLBWrr : PDI<0x60, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "punpcklbw\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v16i8 (vector_shuffle VR128:$src1, VR128:$src2,
                                  UNPCKL_shuffle_mask)))]>;
  def PUNPCKLBWrm : PDI<0x60, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                        "punpcklbw\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v16i8 (vector_shuffle VR128:$src1,
                                  (bc_v16i8 (memopv2i64 addr:$src2)),
                                  UNPCKL_shuffle_mask)))]>;
  def PUNPCKLWDrr : PDI<0x61, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "punpcklwd\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v8i16 (vector_shuffle VR128:$src1, VR128:$src2,
                                  UNPCKL_shuffle_mask)))]>;
  def PUNPCKLWDrm : PDI<0x61, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                        "punpcklwd\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v8i16 (vector_shuffle VR128:$src1,
                                  (bc_v8i16 (memopv2i64 addr:$src2)),
                                  UNPCKL_shuffle_mask)))]>;
  def PUNPCKLDQrr : PDI<0x62, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "punpckldq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
                                  UNPCKL_shuffle_mask)))]>;
  def PUNPCKLDQrm : PDI<0x62, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                        "punpckldq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v4i32 (vector_shuffle VR128:$src1,
                                  (bc_v4i32 (memopv2i64 addr:$src2)),
                                  UNPCKL_shuffle_mask)))]>;
  def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                         "punpcklqdq\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
                                   UNPCKL_shuffle_mask)))]>;
  def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
                         (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                         "punpcklqdq\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v2i64 (vector_shuffle VR128:$src1,
                                   (memopv2i64 addr:$src2),
                                   UNPCKL_shuffle_mask)))]>;

  def PUNPCKHBWrr : PDI<0x68, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "punpckhbw\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v16i8 (vector_shuffle VR128:$src1, VR128:$src2,
                                  UNPCKH_shuffle_mask)))]>;
  def PUNPCKHBWrm : PDI<0x68, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                        "punpckhbw\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v16i8 (vector_shuffle VR128:$src1,
                                  (bc_v16i8 (memopv2i64 addr:$src2)),
                                  UNPCKH_shuffle_mask)))]>;
  def PUNPCKHWDrr : PDI<0x69, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "punpckhwd\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v8i16 (vector_shuffle VR128:$src1, VR128:$src2,
                                  UNPCKH_shuffle_mask)))]>;
  def PUNPCKHWDrm : PDI<0x69, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                        "punpckhwd\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v8i16 (vector_shuffle VR128:$src1,
                                  (bc_v8i16 (memopv2i64 addr:$src2)),
                                  UNPCKH_shuffle_mask)))]>;
  def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "punpckhdq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
                                  UNPCKH_shuffle_mask)))]>;
  def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                        "punpckhdq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v4i32 (vector_shuffle VR128:$src1,
                                  (bc_v4i32 (memopv2i64 addr:$src2)),
                                  UNPCKH_shuffle_mask)))]>;
  def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                         "punpckhqdq\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
                                   UNPCKH_shuffle_mask)))]>;
  def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
                         (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                         "punpckhqdq\t{$src2, $dst|$dst, $src2}",
                         [(set VR128:$dst,
                           (v2i64 (vector_shuffle VR128:$src1,
                                   (memopv2i64 addr:$src2),
                                   UNPCKH_shuffle_mask)))]>;
}

// Extract / Insert
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                     (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
                                       imm:$src2))]>;
let Constraints = "$src1 = $dst" in {
  def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1,
                                                GR32:$src2, i32i8imm:$src3),
                        "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                        [(set VR128:$dst,
                          (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
  def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1,
                                                i16mem:$src2, i32i8imm:$src3),
                        "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                        [(set VR128:$dst,
                          (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                                     imm:$src3))]>;
}

// Mask creation
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
                     "pmovmskb\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;

// Conditional store
let Uses = [EDI] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
                     "maskmovdqu\t{$mask, $src|$src, $mask}",
                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;

// Non-temporal stores
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
def MOVNTImr  :   I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                    "movnti\t{$src, $dst|$dst, $src}",
                    [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
                  TB, Requires<[HasSSE2]>;
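
// Non-temporal stores write around the cache hierarchy; they are intended
// for data that will not be re-read soon, so it does not displace lines that
// are still useful.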

// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
                "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
              TB, Requires<[HasSSE2]>;

// Load, store, and memory fence
def LFENCE : I<0xAE, MRM5m, (outs), (ins),
               "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM6m, (outs), (ins),
               "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;

//TODO: custom lower this so as to never even generate the noop
def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
                      (i8 0)), (NOOP)>;
def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
                      (i8 1)), (MFENCE)>;
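
// The five membarrier operands are assumed to encode load-load, load-store,
// store-load, and store-store ordering plus a final device flag: a pure
// store-store barrier selects SFENCE, a pure load-load barrier selects
// LFENCE, and the remaining cases fall back to a NOOP or a full MFENCE as
// matched above.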

// Alias instruction that maps the all-ones vector to pcmpeqd (comparing a
// register with itself sets every bit).
let isReMaterializable = 1 in
  def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins),
                         "pcmpeqd\t$dst, $dst",
                         [(set VR128:$dst, (v4i32 immAllOnesV))]>;

// FR64 to 128-bit vector conversion.
def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src),
                     "movsd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2f64 (scalar_to_vector FR64:$src)))]>;
def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                     "movsd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;

def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))]>;
def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;

def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                     "movd\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (bitconvert GR32:$src))]>;

def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                     "movd\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;

// SSE2 instructions with XS prefix
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                  Requires<[HasSSE2]>;
def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)]>;

// FIXME: we may not be able to eliminate this movss with coalescing since
// the src and dest register classes are different. We really want to write
// this pattern like this:
// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
//           (f32 FR32:$src)>;
def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins VR128:$src),
                     "movsd\t{$src, $dst|$dst, $src}",
                     [(set FR64:$dst, (vector_extract (v2f64 VR128:$src),
                                       (iPTR 0)))]>;
def MOVPD2SDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movsd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (vector_extract (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>;
def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))]>;
def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(store (i32 (vector_extract (v4i32 VR128:$src),
                                    (iPTR 0))), addr:$dst)]>;

def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                     "movd\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (bitconvert FR32:$src))]>;
def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                     "movd\t{$src, $dst|$dst, $src}",
                     [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;

// Move to lower bits of a VR128, leaving upper bits alone.
// Three operand (but two address) aliases.
let Constraints = "$src1 = $dst" in {
  let neverHasSideEffects = 1 in
  def MOVLSD2PDrr : SDI<0x10, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, FR64:$src2),
                        "movsd\t{$src2, $dst|$dst, $src2}", []>;

  let AddedComplexity = 15 in
  def MOVLPDrr : SDI<0x10, MRMSrcReg,
                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     "movsd\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
                       (v2f64 (vector_shuffle VR128:$src1, VR128:$src2,
                               MOVL_shuffle_mask)))]>;
}

// Store / copy lower 64-bits of a XMM register.
def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;

// Move to the lower bits of a VR128 while zeroing the upper bits; loading
// from memory automatically zeroes the upper bits.
let AddedComplexity = 20 in {
def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                      "movsd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2f64 (X86vzmovl (v2f64 (scalar_to_vector
                                                  (loadf64 addr:$src))))))]>;

def : Pat<(v2f64 (X86vzmovl (memopv2f64 addr:$src))),
          (MOVZSD2PDrm addr:$src)>;
def : Pat<(v2f64 (X86vzload addr:$src)), (MOVZSD2PDrm addr:$src)>;
}

// movd / movq to XMM register zero-extends
let AddedComplexity = 15 in {
def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86vzmovl
                                          (v4i32 (scalar_to_vector GR32:$src)))))]>;
// This is X86-64 only.
def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v2i64 (X86vzmovl
                                           (v2i64 (scalar_to_vector GR64:$src)))))]>;
}

let AddedComplexity = 20 in {
def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
                                                   (loadi32 addr:$src))))))]>;
def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))]>, XS,
                   Requires<[HasSSE2]>;

def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
}

// Moving from XMM to XMM and clearing the upper 64 bits. Note: the IA-32
// documentation has a bug here; movq xmm1, xmm2 does clear the high bits.
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                      XS, Requires<[HasSSE2]>;

let AddedComplexity = 20 in
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v2i64 (X86vzmovl
                                                  (memopv2i64 addr:$src))))]>,
                      XS, Requires<[HasSSE2]>;

//===----------------------------------------------------------------------===//
// SSE3 Instructions
//===----------------------------------------------------------------------===//

// Move Instructions
def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movshdup\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (v4f32 (vector_shuffle
                                                VR128:$src, (undef),
                                                MOVSHDUP_shuffle_mask)))]>;
def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "movshdup\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (v4f32 (vector_shuffle
                                                (memopv4f32 addr:$src), (undef),
                                                MOVSHDUP_shuffle_mask)))]>;

def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movsldup\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (v4f32 (vector_shuffle
                                                VR128:$src, (undef),
                                                MOVSLDUP_shuffle_mask)))]>;
def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "movsldup\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (v4f32 (vector_shuffle
                                                (memopv4f32 addr:$src), (undef),
                                                MOVSLDUP_shuffle_mask)))]>;

def MOVDDUPrr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movddup\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2f64 (vector_shuffle
                                               VR128:$src, (undef),
                                               SSE_splat_lo_mask)))]>;
def MOVDDUPrm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                     "movddup\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2f64 (vector_shuffle
                               (scalar_to_vector (loadf64 addr:$src)),
                               (undef),
                               SSE_splat_lo_mask)))]>;

// Arithmetic
let Constraints = "$src1 = $dst" in {
  def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "addsubps\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
                                           VR128:$src2))]>;
  def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                        "addsubps\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
                                           (load addr:$src2)))]>;
  def ADDSUBPDrr : S3I<0xD0, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "addsubpd\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
                                          VR128:$src2))]>;
  def ADDSUBPDrm : S3I<0xD0, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
                       "addsubpd\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
                                          (load addr:$src2)))]>;
}

def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;

// Horizontal ops
class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
  : S3DI<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>;
class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
  : S3DI<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (load addr:$src2))))]>;
class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
  : S3I<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>;
class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
  : S3I<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (load addr:$src2))))]>;

let Constraints = "$src1 = $dst" in {
  def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>;
  def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>;
  def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
  def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
  def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
  def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
  def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
  def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
}

// Thread synchronization
def MONITOR : I<0xC8, RawFrm, (outs), (ins), "monitor",
                [(int_x86_sse3_monitor EAX, ECX, EDX)]>, TB, Requires<[HasSSE3]>;
def MWAIT   : I<0xC9, RawFrm, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
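
// monitor takes the linear address to watch in EAX with its extension and
// hint words in ECX and EDX; mwait takes its extension word in ECX and its
// hints in EAX, matching the intrinsic operand order above.
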
// vector_shuffle v1, <undef> <1, 1, 3, 3>
let AddedComplexity = 15 in
def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
                  MOVSHDUP_shuffle_mask)),
          (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
let AddedComplexity = 20 in
def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (memopv2i64 addr:$src)), (undef),
                  MOVSHDUP_shuffle_mask)),
          (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;

// vector_shuffle v1, <undef> <0, 0, 2, 2>
let AddedComplexity = 15 in
def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
                  MOVSLDUP_shuffle_mask)),
          (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
let AddedComplexity = 20 in
def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (memopv2i64 addr:$src)), (undef),
                  MOVSLDUP_shuffle_mask)),
          (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;

//===----------------------------------------------------------------------===//
// SSSE3 Instructions
//===----------------------------------------------------------------------===//

/// SS3I_unop_rm_int_8 - Simple SSSE3 unary operator whose type is v*i8.
multiclass SS3I_unop_rm_int_8<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId64, Intrinsic IntId128> {
  def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR64:$dst, (IntId64 VR64:$src))]>;

  def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR64:$dst,
                     (IntId64 (bitconvert (memopv8i8 addr:$src))))]>;

  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    OpSize;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128
                       (bitconvert (memopv16i8 addr:$src))))]>, OpSize;
}

/// SS3I_unop_rm_int_16 - Simple SSSE3 unary operator whose type is v*i16.
multiclass SS3I_unop_rm_int_16<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId64, Intrinsic IntId128> {
  def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
                   (ins VR64:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR64:$dst, (IntId64 VR64:$src))]>;

  def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
                   (ins i64mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR64:$dst,
                     (IntId64
                      (bitconvert (memopv4i16 addr:$src))))]>;

  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    OpSize;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128
                       (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
}

/// SS3I_unop_rm_int_32 - Simple SSSE3 unary operator whose type is v*i32.
multiclass SS3I_unop_rm_int_32<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId64, Intrinsic IntId128> {
  def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
                   (ins VR64:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR64:$dst, (IntId64 VR64:$src))]>;

  def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
                   (ins i64mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR64:$dst,
                     (IntId64
                      (bitconvert (memopv2i32 addr:$src))))]>;

  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    OpSize;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128
                       (bitconvert (memopv4i32 addr:$src))))]>, OpSize;
}

defm PABSB : SS3I_unop_rm_int_8 <0x1C, "pabsb",
                                 int_x86_ssse3_pabs_b,
                                 int_x86_ssse3_pabs_b_128>;
defm PABSW : SS3I_unop_rm_int_16<0x1D, "pabsw",
                                 int_x86_ssse3_pabs_w,
                                 int_x86_ssse3_pabs_w_128>;
defm PABSD : SS3I_unop_rm_int_32<0x1E, "pabsd",
                                 int_x86_ssse3_pabs_d,
                                 int_x86_ssse3_pabs_d_128>;

/// SS3I_binop_rm_int_8 - Simple SSSE3 binary operator whose type is v*i8.
let Constraints = "$src1 = $dst" in {
  multiclass SS3I_binop_rm_int_8<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId64, Intrinsic IntId128,
                                 bit Commutable = 0> {
    def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
                     (ins VR64:$src1, VR64:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
      let isCommutable = Commutable;
    }
    def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
                     (ins VR64:$src1, i64mem:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR64:$dst,
                       (IntId64 VR64:$src1,
                        (bitconvert (memopv8i8 addr:$src2))))]>;

    def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
                      OpSize {
      let isCommutable = Commutable;
    }
    def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, i128mem:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst,
                        (IntId128 VR128:$src1,
                         (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
  }
}

/// SS3I_binop_rm_int_16 - Simple SSSE3 binary operator whose type is v*i16.
let Constraints = "$src1 = $dst" in {
  multiclass SS3I_binop_rm_int_16<bits<8> opc, string OpcodeStr,
                                  Intrinsic IntId64, Intrinsic IntId128,
                                  bit Commutable = 0> {
    def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
                     (ins VR64:$src1, VR64:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
      let isCommutable = Commutable;
    }
    def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
                     (ins VR64:$src1, i64mem:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR64:$dst,
                       (IntId64 VR64:$src1,
                        (bitconvert (memopv4i16 addr:$src2))))]>;

    def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
                      OpSize {
      let isCommutable = Commutable;
    }
    def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, i128mem:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst,
                        (IntId128 VR128:$src1,
                         (bitconvert (memopv8i16 addr:$src2))))]>, OpSize;
  }
}

/// SS3I_binop_rm_int_32 - Simple SSSE3 binary operator whose type is v*i32.
let Constraints = "$src1 = $dst" in {
  multiclass SS3I_binop_rm_int_32<bits<8> opc, string OpcodeStr,
                                  Intrinsic IntId64, Intrinsic IntId128,
                                  bit Commutable = 0> {
    def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
                     (ins VR64:$src1, VR64:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
      let isCommutable = Commutable;
    }
    def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
                     (ins VR64:$src1, i64mem:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR64:$dst,
                       (IntId64 VR64:$src1,
                        (bitconvert (memopv2i32 addr:$src2))))]>;

    def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
                      OpSize {
      let isCommutable = Commutable;
    }
    def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, i128mem:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst,
                        (IntId128 VR128:$src1,
                         (bitconvert (memopv4i32 addr:$src2))))]>, OpSize;
  }
}

defm PHADDW    : SS3I_binop_rm_int_16<0x01, "phaddw",
                                      int_x86_ssse3_phadd_w,
                                      int_x86_ssse3_phadd_w_128, 1>;
defm PHADDD    : SS3I_binop_rm_int_32<0x02, "phaddd",
                                      int_x86_ssse3_phadd_d,
                                      int_x86_ssse3_phadd_d_128, 1>;
defm PHADDSW   : SS3I_binop_rm_int_16<0x03, "phaddsw",
                                      int_x86_ssse3_phadd_sw,
                                      int_x86_ssse3_phadd_sw_128, 1>;
defm PHSUBW    : SS3I_binop_rm_int_16<0x05, "phsubw",
                                      int_x86_ssse3_phsub_w,
                                      int_x86_ssse3_phsub_w_128>;
defm PHSUBD    : SS3I_binop_rm_int_32<0x06, "phsubd",
                                      int_x86_ssse3_phsub_d,
                                      int_x86_ssse3_phsub_d_128>;
defm PHSUBSW   : SS3I_binop_rm_int_16<0x07, "phsubsw",
                                      int_x86_ssse3_phsub_sw,
                                      int_x86_ssse3_phsub_sw_128>;
defm PMADDUBSW : SS3I_binop_rm_int_8 <0x04, "pmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw,
                                      int_x86_ssse3_pmadd_ub_sw_128, 1>;
defm PMULHRSW  : SS3I_binop_rm_int_16<0x0B, "pmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw,
                                      int_x86_ssse3_pmul_hr_sw_128, 1>;
defm PSHUFB    : SS3I_binop_rm_int_8 <0x00, "pshufb",
                                      int_x86_ssse3_pshuf_b,
                                      int_x86_ssse3_pshuf_b_128>;
defm PSIGNB    : SS3I_binop_rm_int_8 <0x08, "psignb",
                                      int_x86_ssse3_psign_b,
                                      int_x86_ssse3_psign_b_128>;
defm PSIGNW    : SS3I_binop_rm_int_16<0x09, "psignw",
                                      int_x86_ssse3_psign_w,
                                      int_x86_ssse3_psign_w_128>;
defm PSIGND    : SS3I_binop_rm_int_32<0x0A, "psignd",
                                      int_x86_ssse3_psign_d,
                                      int_x86_ssse3_psign_d_128>;

let Constraints = "$src1 = $dst" in {
  def PALIGNR64rr  : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
                           (ins VR64:$src1, VR64:$src2, i16imm:$src3),
                           "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                           [(set VR64:$dst,
                             (int_x86_ssse3_palign_r
                              VR64:$src1, VR64:$src2,
                              imm:$src3))]>;
  def PALIGNR64rm  : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
                           (ins VR64:$src1, i64mem:$src2, i16imm:$src3),
                           "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                           [(set VR64:$dst,
                             (int_x86_ssse3_palign_r
                              VR64:$src1,
                              (bitconvert (memopv2i32 addr:$src2)),
                              imm:$src3))]>;

  def PALIGNR128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
                           (ins VR128:$src1, VR128:$src2, i32imm:$src3),
                           "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                           [(set VR128:$dst,
                             (int_x86_ssse3_palign_r_128
                              VR128:$src1, VR128:$src2,
                              imm:$src3))]>, OpSize;
  def PALIGNR128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
                           (ins VR128:$src1, i128mem:$src2, i32imm:$src3),
                           "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                           [(set VR128:$dst,
                             (int_x86_ssse3_palign_r_128
                              VR128:$src1,
                              (bitconvert (memopv4i32 addr:$src2)),
                              imm:$src3))]>, OpSize;
}

//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//

// extload f32 -> f64. This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag combine.
// Since these loads aren't folded into the fextend, we have to match it
// explicitly here.
let Predicates = [HasSSE2] in
def : Pat<(fextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>;

// bit_convert
let Predicates = [HasSSE2] in {
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
}

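// All of the bitconvert patterns above are no-ops: every 128-bit vector type
// lives in the same XMM register class, so a bit_convert between them maps
// to the source register itself rather than to any instruction.
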
// Move scalar to XMM zero-extended
// movd to XMM register zero-extends
let AddedComplexity = 15 in {
// Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
          (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
          (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE1]>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>;
}

// Splat v2f64 / v2i64
let AddedComplexity = 10 in {
def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), SSE_splat_lo_mask:$sm),
          (UNPCKLPDrr VR128:$src, VR128:$src)>,   Requires<[HasSSE2]>;
def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm),
          (UNPCKHPDrr VR128:$src, VR128:$src)>,   Requires<[HasSSE2]>;
def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), SSE_splat_lo_mask:$sm),
          (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm),
          (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
}

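// Splatting an element of a v2f64/v2i64 is just an unpack of the register
// with itself: the unpckl forms duplicate the low element and the unpckh
// forms duplicate the high element.
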
// Special unary SHUFPSrri case.
def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef),
                  SHUFP_unary_shuffle_mask:$sm)),
          (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
      Requires<[HasSSE1]>;
// Special unary SHUFPDrri case.
def : Pat<(v2f64 (vector_shuffle VR128:$src1, (undef),
                  SHUFP_unary_shuffle_mask:$sm)),
          (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
      Requires<[HasSSE2]>;
// Unary v4f32 shuffle with PSHUF* in order to fold a load.
def : Pat<(vector_shuffle (bc_v4i32 (memopv4f32 addr:$src1)), (undef),
           SHUFP_unary_shuffle_mask:$sm),
          (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>,
      Requires<[HasSSE2]>;
// Special binary v4i32 shuffle cases with SHUFPS.
def : Pat<(v4i32 (vector_shuffle VR128:$src1, (v4i32 VR128:$src2),
                  PSHUFD_binary_shuffle_mask:$sm)),
          (SHUFPSrri VR128:$src1, VR128:$src2, PSHUFD_binary_shuffle_mask:$sm)>,
      Requires<[HasSSE2]>;
def : Pat<(v4i32 (vector_shuffle VR128:$src1,
                  (bc_v4i32 (memopv2i64 addr:$src2)),
                  PSHUFD_binary_shuffle_mask:$sm)),
          (SHUFPSrmi VR128:$src1, addr:$src2, PSHUFD_binary_shuffle_mask:$sm)>,
      Requires<[HasSSE2]>;
// Special binary v2i64 shuffle cases using SHUFPDrri.
def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
                  SHUFP_shuffle_mask:$sm)),
          (SHUFPDrri VR128:$src1, VR128:$src2, SHUFP_shuffle_mask:$sm)>,
      Requires<[HasSSE2]>;
// Special unary SHUFPDrri case.
def : Pat<(v2i64 (vector_shuffle VR128:$src1, (undef),
                  SHUFP_unary_shuffle_mask:$sm)),
          (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
      Requires<[HasSSE2]>;

// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
let AddedComplexity = 10 in {
def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
                  UNPCKL_v_undef_shuffle_mask)),
          (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef),
                  UNPCKL_v_undef_shuffle_mask)),
          (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef),
                  UNPCKL_v_undef_shuffle_mask)),
          (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
                  UNPCKL_v_undef_shuffle_mask)),
          (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
}
// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
let AddedComplexity = 10 in {
def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
                  UNPCKH_v_undef_shuffle_mask)),
          (UNPCKHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef),
                  UNPCKH_v_undef_shuffle_mask)),
          (PUNPCKHBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef),
                  UNPCKH_v_undef_shuffle_mask)),
          (PUNPCKHWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
                  UNPCKH_v_undef_shuffle_mask)),
          (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
}
let AddedComplexity = 15 in {
// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
                  MOVHP_shuffle_mask)),
          (MOVLHPSrr VR128:$src1, VR128:$src2)>;

// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
                  MOVHLPS_shuffle_mask)),
          (MOVHLPSrr VR128:$src1, VR128:$src2)>;

// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef),
                  MOVHLPS_v_undef_shuffle_mask)),
          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
def : Pat<(v4i32 (vector_shuffle VR128:$src1, (undef),
                  MOVHLPS_v_undef_shuffle_mask)),
          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
}
let AddedComplexity = 20 in {
// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
// vector_shuffle v1, (load v2) <0, 1, 4, 5> using MOVHPS
def : Pat<(v4f32 (vector_shuffle VR128:$src1, (memopv4f32 addr:$src2),
                  MOVLP_shuffle_mask)),
          (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
def : Pat<(v2f64 (vector_shuffle VR128:$src1, (memopv2f64 addr:$src2),
                  MOVLP_shuffle_mask)),
          (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v4f32 (vector_shuffle VR128:$src1, (memopv4f32 addr:$src2),
                  MOVHP_shuffle_mask)),
          (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
def : Pat<(v2f64 (vector_shuffle VR128:$src1, (memopv2f64 addr:$src2),
                  MOVHP_shuffle_mask)),
          (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;

def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)),
                  MOVLP_shuffle_mask)),
          (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (vector_shuffle VR128:$src1, (memopv2i64 addr:$src2),
                  MOVLP_shuffle_mask)),
          (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)),
                  MOVHP_shuffle_mask)),
          (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (vector_shuffle VR128:$src1, (memopv2i64 addr:$src2),
                  MOVHP_shuffle_mask)),
          (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
}
let AddedComplexity = 15 in {
// Setting the lowest element in the vector.
def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
                  MOVL_shuffle_mask)),
          (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
                  MOVL_shuffle_mask)),
          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;

// vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd)
def : Pat<(v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
                  MOVLP_shuffle_mask)),
          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
                  MOVLP_shuffle_mask)),
          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
}
// Set lowest element and zero upper elements.
let AddedComplexity = 15 in
def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc, VR128:$src,
                  MOVL_shuffle_mask)),
          (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
          (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
// FIXME: Temporary workaround since 2-wide shuffle is broken.
def : Pat<(int_x86_sse2_movs_d VR128:$src1, VR128:$src2),
          (v2f64 (MOVLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_loadh_pd VR128:$src1, addr:$src2),
          (v2f64 (MOVHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_loadl_pd VR128:$src1, addr:$src2),
          (v2f64 (MOVLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, VR128:$src2, imm:$src3),
          (v2f64 (SHUFPDrri VR128:$src1, VR128:$src2, imm:$src3))>,
      Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, (load addr:$src2), imm:$src3),
          (v2f64 (SHUFPDrmi VR128:$src1, addr:$src2, imm:$src3))>,
      Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, VR128:$src2),
          (v2f64 (UNPCKHPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, (load addr:$src2)),
          (v2f64 (UNPCKHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, VR128:$src2),
          (v2f64 (UNPCKLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, (load addr:$src2)),
          (v2f64 (UNPCKLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, VR128:$src2),
          (v2i64 (PUNPCKHQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, (load addr:$src2)),
          (v2i64 (PUNPCKHQDQrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, VR128:$src2),
          (v2i64 (PUNPCKLQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, (load addr:$src2)),
          (v2i64 (PUNPCKLQDQrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
// Some special-case pandn patterns.
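// PANDN computes (NOT src1) AND src2; the DAG canonicalizes that as
// (and (xor src1, all-ones), src2), so match that form directly here.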
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
                  VR128:$src2)),
          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
                  VR128:$src2)),
          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
                  VR128:$src2)),
          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;

def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
                  (memopv2i64 addr:$src2))),
          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
                  (memopv2i64 addr:$src2))),
          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
                  (memopv2i64 addr:$src2))),
          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
// vector -> vector casts
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
          (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
          (Int_CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
// Use movaps / movups for SSE integer load / store (one byte shorter).
def : Pat<(alignedloadv4i32 addr:$src),
          (MOVAPSrm addr:$src)>, Requires<[HasSSE1]>;
def : Pat<(loadv4i32 addr:$src),
          (MOVUPSrm addr:$src)>, Requires<[HasSSE1]>;
def : Pat<(alignedloadv2i64 addr:$src),
          (MOVAPSrm addr:$src)>, Requires<[HasSSE2]>;
def : Pat<(loadv2i64 addr:$src),
          (MOVUPSrm addr:$src)>, Requires<[HasSSE2]>;

def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(store (v2i64 VR128:$src), addr:$dst),
          (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(store (v4i32 VR128:$src), addr:$dst),
          (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(store (v8i16 VR128:$src), addr:$dst),
          (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
          (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
//===----------------------------------------------------------------------===//
// SSE4.1 Instructions
//===----------------------------------------------------------------------===//
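/// sse41_fp_unop_rm - SSE 4.1 unary FP operation taking an 8-bit immediate
/// rounding-control operand; instantiated below for the ROUND* family.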
multiclass sse41_fp_unop_rm<bits<8> opcss, bits<8> opcps,
                            bits<8> opcsd, bits<8> opcpd,
                            string OpcodeStr,
                            Intrinsic F32Int,
                            Intrinsic V4F32Int,
                            Intrinsic F64Int,
                            Intrinsic V2F64Int> {
  // Intrinsic operation, reg.
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                     "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (F32Int VR128:$src1, imm:$src2))]>,
                    OpSize;

  // Intrinsic operation, mem.
  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
                    (outs VR128:$dst), (ins ssmem:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                     "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (F32Int sse_load_f32:$src1, imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, reg
  def PSr_Int : SS4AIi8<opcps, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, mem
  def PSm_Int : SS4AIi8<opcps, MRMSrcMem,
                    (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (V4F32Int (load addr:$src1), imm:$src2))]>,
                    OpSize;

  // Intrinsic operation, reg.
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                     "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (F64Int VR128:$src1, imm:$src2))]>,
                    OpSize;

  // Intrinsic operation, mem.
  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
                    (outs VR128:$dst), (ins sdmem:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                     "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (F64Int sse_load_f64:$src1, imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, reg
  def PDr_Int : SS4AIi8<opcpd, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, mem
  def PDm_Int : SS4AIi8<opcpd, MRMSrcMem,
                    (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (V2F64Int (load addr:$src1), imm:$src2))]>,
                    OpSize;
}
// FP round - roundss, roundps, roundsd, roundpd
defm ROUND : sse41_fp_unop_rm<0x0A, 0x08, 0x0B, 0x09, "round",
                              int_x86_sse41_round_ss, int_x86_sse41_round_ps,
                              int_x86_sse41_round_sd, int_x86_sse41_round_pd>;
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128 (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
}

defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                        int_x86_sse41_phminposuw>;
/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId128, bit Commutable = 0> {
    def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
                   OpSize {
      let isCommutable = Commutable;
    }
    def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst,
                     (IntId128 VR128:$src1,
                      (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
  }
}

defm PCMPEQQ  : SS41I_binop_rm_int<0x29, "pcmpeqq",
                                   int_x86_sse41_pcmpeqq, 1>;
defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw",
                                   int_x86_sse41_packusdw, 0>;
defm PMINSB   : SS41I_binop_rm_int<0x38, "pminsb",
                                   int_x86_sse41_pminsb, 1>;
defm PMINSD   : SS41I_binop_rm_int<0x39, "pminsd",
                                   int_x86_sse41_pminsd, 1>;
defm PMINUD   : SS41I_binop_rm_int<0x3B, "pminud",
                                   int_x86_sse41_pminud, 1>;
defm PMINUW   : SS41I_binop_rm_int<0x3A, "pminuw",
                                   int_x86_sse41_pminuw, 1>;
defm PMAXSB   : SS41I_binop_rm_int<0x3C, "pmaxsb",
                                   int_x86_sse41_pmaxsb, 1>;
defm PMAXSD   : SS41I_binop_rm_int<0x3D, "pmaxsd",
                                   int_x86_sse41_pmaxsd, 1>;
defm PMAXUD   : SS41I_binop_rm_int<0x3F, "pmaxud",
                                   int_x86_sse41_pmaxud, 1>;
defm PMAXUW   : SS41I_binop_rm_int<0x3E, "pmaxuw",
                                   int_x86_sse41_pmaxuw, 1>;
defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",
                                   int_x86_sse41_pmuldq, 1>;
/// SS41I_binop_patint - SSE 4.1 binary operator with both a pattern-based
/// form and an intrinsic form.
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                Intrinsic IntId128, bit Commutable = 0> {
    def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst, (OpNode (v4i32 VR128:$src1),
                                             VR128:$src2))]>, OpSize {
      let isCommutable = Commutable;
    }
    def rr_int : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
                   OpSize {
      let isCommutable = Commutable;
    }
    def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst,
                     (OpNode VR128:$src1, (memopv4i32 addr:$src2)))]>, OpSize;
    def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst,
                     (IntId128 VR128:$src1, (memopv4i32 addr:$src2)))]>,
                   OpSize;
  }
}

defm PMULLD : SS41I_binop_patint<0x40, "pmulld", mul,
                                 int_x86_sse41_pmulld, 1>;
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128, bit Commutable = 0> {
    def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
                      !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                      [(set VR128:$dst,
                        (IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>,
                      OpSize {
      let isCommutable = Commutable;
    }
    def rmi : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, i128mem:$src2, i32i8imm:$src3),
                      !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                      [(set VR128:$dst,
                        (IntId128 VR128:$src1,
                         (bitconvert (memopv16i8 addr:$src2)), imm:$src3))]>,
                      OpSize;
  }
}

defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps",
                                   int_x86_sse41_blendps, 0>;
defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd",
                                   int_x86_sse41_blendpd, 0>;
defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw",
                                   int_x86_sse41_pblendw, 0>;
defm DPPS    : SS41I_binop_rmi_int<0x40, "dpps",
                                   int_x86_sse41_dpps, 1>;
defm DPPD    : SS41I_binop_rmi_int<0x41, "dppd",
                                   int_x86_sse41_dppd, 1>;
defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw",
                                   int_x86_sse41_mpsadbw, 0>;
/// SS41I_ternary_int - SSE 4.1 ternary operator
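/// The third (mask) operand is implicitly XMM0, modeled with Uses = [XMM0].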
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
                    OpSize;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
  }
}

defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
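/// SS41I_binop_rm_int8 - SSE 4.1 pmovsx/pmovzx extension; the memory form
/// reads a 64-bit (8-byte) operand.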
multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (IntId (bitconvert (v4i32 (load addr:$src)))))]>, OpSize;
}

defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
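/// SS41I_binop_rm_int4 - like the int8 variant, but the memory form reads a
/// 32-bit (4-byte) operand.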
multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (IntId (bitconvert (v4i32 (load addr:$src)))))]>, OpSize;
}

defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
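/// SS41I_binop_rm_int2 - like the int8 variant, but the memory form reads a
/// 16-bit (2-byte) operand.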
multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (IntId (bitconvert (v4i32 (load addr:$src)))))]>, OpSize;
}

defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32-bit reg or 8-bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                   (ins VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
                   OpSize;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   []>, OpSize;
  // FIXME:
  // There's an AssertZext in the way of writing the store pattern
  // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
}

defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   []>, OpSize;
  // FIXME:
  // There's an AssertZext in the way of writing the store pattern
  // (store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), addr:$dst)
}

defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                   (ins VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst,
                     (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                            addr:$dst)]>, OpSize;
}

defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
/// SS41I_extractf32 - SSE 4.1 extract 32-bit fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                   (ins VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst,
                     (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                   OpSize;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                            addr:$dst)]>, OpSize;
}

defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> {
    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
                     !strconcat(OpcodeStr,
                      "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                     [(set VR128:$dst,
                       (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
                     !strconcat(OpcodeStr,
                      "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                     [(set VR128:$dst,
                       (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                                  imm:$src3))]>, OpSize;
  }
}

defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> {
    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
                     !strconcat(OpcodeStr,
                      "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                     [(set VR128:$dst,
                       (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
                     OpSize;
    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
                     !strconcat(OpcodeStr,
                      "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                     [(set VR128:$dst,
                       (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                                         imm:$src3)))]>, OpSize;
  }
}

defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> {
    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, FR32:$src2, i32i8imm:$src3),
                     !strconcat(OpcodeStr,
                      "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                     [(set VR128:$dst,
                       (X86insrtps VR128:$src1, FR32:$src2, imm:$src3))]>, OpSize;
    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
                     !strconcat(OpcodeStr,
                      "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                     [(set VR128:$dst,
                       (X86insrtps VR128:$src1, (loadf32 addr:$src2),
                                   imm:$src3))]>, OpSize;
  }
}

defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
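// ptest compares two operands without writing a register result: ZF is set
// when the bitwise AND of the operands is all zeros, and CF is set when the
// AND-NOT is all zeros.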
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                    "ptest\t{$src2, $src1|$src1, $src2}", []>, OpSize;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
                    "ptest\t{$src2, $src1|$src1, $src2}", []>, OpSize;
}
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       OpSize;