1 //====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file describes the X86 SSE instruction set, defining the instructions,
11 // and properties of the instructions which are needed for code generation,
12 // machine code emission, and analysis.
14 //===----------------------------------------------------------------------===//
17 //===----------------------------------------------------------------------===//
18 // SSE specific DAG Nodes.
19 //===----------------------------------------------------------------------===//
// Type profile for the FP shift node: one result and two operands. Operand 1
// has the same type as the result, the result is floating point, and
// operand 2 is an integer.
21 def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
22 SDTCisFP<0>, SDTCisInt<2> ]>;
// Type profile for the packed FP compare nodes: one integer result and three
// operands. Operands 1 and 2 share a floating-point type; operand 3 is an i8.
23 def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
24 SDTCisFP<1>, SDTCisVT<3, i8>]>;
// SDNode records binding X86ISD opcodes to pattern operators.
// FMIN/FMAX use the generic FP binary-op profile and carry no node properties.
26 def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
27 def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
// The FP bitwise logic nodes are flagged commutative and associative.
28 def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
29 [SDNPCommutative, SDNPAssociative]>;
30 def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
31 [SDNPCommutative, SDNPAssociative]>;
32 def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
33 [SDNPCommutative, SDNPAssociative]>;
// FRSQRT/FRCP use the generic FP unary-op profile.
34 def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
35 def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
// FSRL is typed by SDTX86FPShiftOp.
36 def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
// COMI/UCOMI share the SDTX86CmpTest profile, which is not declared in this
// section of the file.
37 def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
38 def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
// PSHUFB: v16i8 result, with operand 1 constrained to the same type.
// NOTE(review): the constraint list is not closed on the lines visible here
// (the numbering jumps from 40 to 42) - confirm the remaining constraint.
39 def X86pshufb : SDNode<"X86ISD::PSHUFB",
40 SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
// PEXTRB/PEXTRW: i32 result; operand 2 is pointer-typed (SDTCisPtrTy).
42 def X86pextrb : SDNode<"X86ISD::PEXTRB",
43 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
44 def X86pextrw : SDNode<"X86ISD::PEXTRW",
45 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
// PINSRB/PINSRW: vector result (v16i8 / v8i16); operand 1 has the result type,
// operand 2 is an i32, operand 3 is pointer-typed.
46 def X86pinsrb : SDNode<"X86ISD::PINSRB",
47 SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
48 SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
49 def X86pinsrw : SDNode<"X86ISD::PINSRW",
50 SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
51 SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
// INSERTPS: v4f32 result; operands 1 and 2 are v4f32, operand 3 is
// pointer-typed. Note the record name is spelled "insrtps".
52 def X86insrtps : SDNode<"X86ISD::INSERTPS",
53 SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
54 SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
// VZEXT_MOVL: one operand whose type matches the result.
55 def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
56 SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
// VZEXT_LOAD: typed as a load; carries a chain and may load.
57 def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
58 [SDNPHasChain, SDNPMayLoad]>;
// Vector shifts. Note that the record named X86vshr wraps X86ISD::VSRL.
59 def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>;
60 def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>;
// Packed FP compares, typed by SDTX86VFCMP.
61 def X86cmpps : SDNode<"X86ISD::CMPPS", SDTX86VFCMP>;
62 def X86cmppd : SDNode<"X86ISD::CMPPD", SDTX86VFCMP>;
// Packed integer equality compares are marked commutative ...
63 def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>;
64 def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>;
65 def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>;
66 def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>;
// ... while the greater-than compares carry no node properties.
67 def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>;
68 def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>;
69 def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>;
70 def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>;
// Type profile for PTEST: one i32 result and two operands.
// NOTE(review): the operand constraints are not visible here (the numbering
// jumps from 72 to 75) - confirm them before relying on this profile.
72 def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
// PTEST node, typed by SDTX86CmpPTest.
75 def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
77 //===----------------------------------------------------------------------===//
78 // SSE Complex Patterns
79 //===----------------------------------------------------------------------===//
81 // These are 'extloads' from a scalar to the low element of a vector, zeroing
82 // the top elements. These are used for the SSE 'ss' and 'sd' instruction
// Both patterns are matched by the C++ routine named "SelectScalarSSELoad",
// yield 5 operands, and are flagged as having a chain and possibly loading.
84 def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
85 [SDNPHasChain, SDNPMayLoad]>;
86 def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
87 [SDNPHasChain, SDNPMayLoad]>;
// v4f32-typed memory operand: printed with printf32mem, expanded into five
// machine operands (ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm) and parsed as
// an X86MemAsmOperand.
89 def ssmem : Operand<v4f32> {
90 let PrintMethod = "printf32mem";
91 let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
92 let ParserMatchClass = X86MemAsmOperand;
// v2f64-typed counterpart of ssmem; it differs only in the print method
// (printf64mem).
94 def sdmem : Operand<v2f64> {
95 let PrintMethod = "printf64mem";
96 let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
97 let ParserMatchClass = X86MemAsmOperand;
100 //===----------------------------------------------------------------------===//
101 // SSE pattern fragments
102 //===----------------------------------------------------------------------===//
// Plain 'load' fragments with the result pinned to a specific 128-bit vector
// type. No alignment predicate is attached.
104 def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
105 def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
106 def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
107 def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
109 // Like 'store', but always requires vector alignment.
// The predicate accepts the store only when its alignment is at least 16.
110 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
111 (store node:$val, node:$ptr), [{
112 return cast<StoreSDNode>(N)->getAlignment() >= 16;
115 // Like 'load', but always requires vector alignment.
// The predicate accepts the load only when its alignment is at least 16.
116 def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
117 return cast<LoadSDNode>(N)->getAlignment() >= 16;
// Typed wrappers around alignedload. The 'fs' variants produce scalar f32/f64
// results; the others produce 128-bit vector types.
120 def alignedloadfsf32 : PatFrag<(ops node:$ptr),
121 (f32 (alignedload node:$ptr))>;
122 def alignedloadfsf64 : PatFrag<(ops node:$ptr),
123 (f64 (alignedload node:$ptr))>;
124 def alignedloadv4f32 : PatFrag<(ops node:$ptr),
125 (v4f32 (alignedload node:$ptr))>;
126 def alignedloadv2f64 : PatFrag<(ops node:$ptr),
127 (v2f64 (alignedload node:$ptr))>;
128 def alignedloadv4i32 : PatFrag<(ops node:$ptr),
129 (v4i32 (alignedload node:$ptr))>;
130 def alignedloadv2i64 : PatFrag<(ops node:$ptr),
131 (v2i64 (alignedload node:$ptr))>;
133 // Like 'load', but uses special alignment checks suitable for use in
134 // memory operands in most SSE instructions, which are required to
135 // be naturally aligned on some targets but not on others. If the subtarget
136 // allows unaligned accesses, match any load, though this may require
137 // setting a feature bit in the processor (on startup, for example).
138 // Opteron 10h and later implement such a feature.
// The predicate matches when Subtarget->hasVectorUAMem() is true, or
// otherwise when the load's alignment is at least 16.
139 def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
140 return Subtarget->hasVectorUAMem()
141 || cast<LoadSDNode>(N)->getAlignment() >= 16;
// Typed wrappers around memop. The 'fs' variants produce scalar f32/f64
// results; the others produce 128-bit vector types.
144 def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
145 def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
146 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
147 def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
148 def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
149 def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
150 def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
152 // SSSE3 uses MMX registers for some instructions. They aren't aligned on a
154 // FIXME: 8 byte alignment for mmx reads is not required
// The predicate matches loads whose alignment is at least 8.
155 def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
156 return cast<LoadSDNode>(N)->getAlignment() >= 8;
// Typed wrappers around memop64.
// NOTE(review): memopv8i16 yields a 128-bit type (8 x i16) yet uses the
// 8-byte memop64 check rather than memop - confirm this is intentional.
159 def memopv8i8 : PatFrag<(ops node:$ptr), (v8i8 (memop64 node:$ptr))>;
160 def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
161 def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>;
162 def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;
165 // Like 'store', but requires the non-temporal bit to be set
// When N is a StoreSDNode, the result is ST->isNonTemporal(). The fall-through
// for other nodes is not visible on the lines shown here.
166 def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
167 (st node:$val, node:$ptr), [{
168 if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
169 return ST->isNonTemporal();
// Non-temporal store that is also non-truncating, uses UNINDEXED addressing
// and has alignment of at least 16.
173 def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
174 (st node:$val, node:$ptr), [{
175 if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
176 return ST->isNonTemporal() && !ST->isTruncatingStore() &&
177 ST->getAddressingMode() == ISD::UNINDEXED &&
178 ST->getAlignment() >= 16;
// Non-temporal store whose alignment is below 16. Unlike the aligned variant,
// this predicate checks neither truncation nor the addressing mode.
182 def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
183 (st node:$val, node:$ptr), [{
184 if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
185 return ST->isNonTemporal() &&
186 ST->getAlignment() < 16;
// bc_<type>: a bitconvert of $in whose result is pinned to the named 128-bit
// vector type.
190 def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
191 def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
192 def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
193 def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
194 def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
195 def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
// vzmovl_<type>: bitconvert of X86vzmovl applied to a scalar_to_vector of an
// i64 (resp. i32) load from $src.
197 def vzmovl_v2i64 : PatFrag<(ops node:$src),
198 (bitconvert (v2i64 (X86vzmovl
199 (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
200 def vzmovl_v4i32 : PatFrag<(ops node:$src),
201 (bitconvert (v4i32 (X86vzmovl
202 (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
// vzload_v2i64: bitconvert of a v2i64 X86vzload from $src.
204 def vzload_v2i64 : PatFrag<(ops node:$src),
205 (bitconvert (v2i64 (X86vzload node:$src)))>;
// Matches an f32 FP immediate that is exactly +0.0 (via isExactlyValue).
208 def fp32imm0 : PatLeaf<(f32 fpimm), [{
209 return N->isExactlyValue(+0.0);
212 // BYTE_imm - Transform bit immediates into byte immediates.
// The zero-extended immediate is shifted right by 3 and returned as an i32
// immediate.
213 def BYTE_imm : SDNodeXForm<imm, [{
214 // Transformation function: imm >> 3
215 return getI32Imm(N->getZExtValue() >> 3);
218 // SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
// Each SHUFFLE_get_* transform below returns an i8 immediate computed by the
// corresponding X86::getShuffle*Immediate helper.
220 def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
221 return getI8Imm(X86::getShuffleSHUFImmediate(N));
224 // SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
226 def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{
227 return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
230 // SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
232 def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
233 return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
236 // SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to
238 def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{
239 return getI8Imm(X86::getShufflePALIGNRImmediate(N));
// Shuffle-mask pattern fragments. Each one matches a two-operand
// vector_shuffle and filters it with a C++ predicate over the
// ShuffleVectorSDNode.
//
// splat_lo: the shuffle is a splat and its splat index is 0.
242 def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
243 (vector_shuffle node:$lhs, node:$rhs), [{
244 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
245 return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
// The fragments from movddup through unpckh_undef each delegate to the
// X86::is*Mask helper named in their body.
248 def movddup : PatFrag<(ops node:$lhs, node:$rhs),
249 (vector_shuffle node:$lhs, node:$rhs), [{
250 return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N));
253 def movhlps : PatFrag<(ops node:$lhs, node:$rhs),
254 (vector_shuffle node:$lhs, node:$rhs), [{
255 return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N));
258 def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs),
259 (vector_shuffle node:$lhs, node:$rhs), [{
260 return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
263 def movlhps : PatFrag<(ops node:$lhs, node:$rhs),
264 (vector_shuffle node:$lhs, node:$rhs), [{
265 return X86::isMOVLHPSMask(cast<ShuffleVectorSDNode>(N));
268 def movlp : PatFrag<(ops node:$lhs, node:$rhs),
269 (vector_shuffle node:$lhs, node:$rhs), [{
270 return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N));
273 def movl : PatFrag<(ops node:$lhs, node:$rhs),
274 (vector_shuffle node:$lhs, node:$rhs), [{
275 return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N));
278 def movshdup : PatFrag<(ops node:$lhs, node:$rhs),
279 (vector_shuffle node:$lhs, node:$rhs), [{
280 return X86::isMOVSHDUPMask(cast<ShuffleVectorSDNode>(N));
283 def movsldup : PatFrag<(ops node:$lhs, node:$rhs),
284 (vector_shuffle node:$lhs, node:$rhs), [{
285 return X86::isMOVSLDUPMask(cast<ShuffleVectorSDNode>(N));
288 def unpckl : PatFrag<(ops node:$lhs, node:$rhs),
289 (vector_shuffle node:$lhs, node:$rhs), [{
290 return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
293 def unpckh : PatFrag<(ops node:$lhs, node:$rhs),
294 (vector_shuffle node:$lhs, node:$rhs), [{
295 return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
298 def unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
299 (vector_shuffle node:$lhs, node:$rhs), [{
300 return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
303 def unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
304 (vector_shuffle node:$lhs, node:$rhs), [{
305 return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
// The remaining fragments also attach an SDNodeXForm (the last PatFrag
// argument) that produces the instruction's immediate from the matched mask.
// pshufd and shufp share SHUFFLE_get_shuf_imm.
308 def pshufd : PatFrag<(ops node:$lhs, node:$rhs),
309 (vector_shuffle node:$lhs, node:$rhs), [{
310 return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
311 }], SHUFFLE_get_shuf_imm>;
313 def shufp : PatFrag<(ops node:$lhs, node:$rhs),
314 (vector_shuffle node:$lhs, node:$rhs), [{
315 return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N));
316 }], SHUFFLE_get_shuf_imm>;
318 def pshufhw : PatFrag<(ops node:$lhs, node:$rhs),
319 (vector_shuffle node:$lhs, node:$rhs), [{
320 return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N));
321 }], SHUFFLE_get_pshufhw_imm>;
323 def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
324 (vector_shuffle node:$lhs, node:$rhs), [{
325 return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
326 }], SHUFFLE_get_pshuflw_imm>;
328 def palign : PatFrag<(ops node:$lhs, node:$rhs),
329 (vector_shuffle node:$lhs, node:$rhs), [{
330 return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N));
331 }], SHUFFLE_get_palign_imm>;
333 //===----------------------------------------------------------------------===//
334 // SSE scalar FP Instructions
335 //===----------------------------------------------------------------------===//
337 // CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after
338 // instruction selection into a branch sequence.
// All pseudos below read EFLAGS and request a custom inserter. Each takes a
// "true" value ($t), a "false" value ($f) and an i8 condition immediate, and
// is selected from an X86cmov node.
// NOTE(review): the tail of every pattern (after imm:$cond) is not visible on
// the lines shown here.
339 let Uses = [EFLAGS], usesCustomInserter = 1 in {
// Scalar variants on FR32 / FR64.
340 def CMOV_FR32 : I<0, Pseudo,
341 (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
342 "#CMOV_FR32 PSEUDO!",
343 [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
345 def CMOV_FR64 : I<0, Pseudo,
346 (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
347 "#CMOV_FR64 PSEUDO!",
348 [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
// Vector variants on VR128; they differ only in the value type given to the
// X86cmov result (v4f32, v2f64, v2i64).
350 def CMOV_V4F32 : I<0, Pseudo,
351 (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
352 "#CMOV_V4F32 PSEUDO!",
354 (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
356 def CMOV_V2F64 : I<0, Pseudo,
357 (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
358 "#CMOV_V2F64 PSEUDO!",
360 (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
362 def CMOV_V2I64 : I<0, Pseudo,
363 (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
364 "#CMOV_V2I64 PSEUDO!",
366 (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
370 //===----------------------------------------------------------------------===//
372 //===----------------------------------------------------------------------===//
374 // Move Instructions. Register-to-register movss is not used for FR32
375 // register copies because it's a partial register update; FsMOVAPSrr is
376 // used instead. Register-to-register movss is not modeled as an INSERT_SUBREG
377 // because INSERT_SUBREG requires that the insert be implementable in terms of
378 // a copy, and as just mentioned, we don't use movss for copies.
// MOVSSrr ties $src1 to $dst and is selected for a 'movl' shuffle of $src1
// with a scalar_to_vector of the FR32 $src2.
379 let Constraints = "$src1 = $dst" in
380 def MOVSSrr : SSI<0x10, MRMSrcReg,
381 (outs VR128:$dst), (ins VR128:$src1, FR32:$src2),
382 "movss\t{$src2, $dst|$dst, $src2}",
383 [(set (v4f32 VR128:$dst),
384 (movl VR128:$src1, (scalar_to_vector FR32:$src2)))]>;
386 // Extract the low 32-bit value from one vector and insert it into another.
// Implemented by feeding the sub_ss subregister of $src2 to MOVSSrr.
387 let AddedComplexity = 15 in
388 def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
389 (MOVSSrr (v4f32 VR128:$src1),
390 (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
392 // Implicitly promote a 32-bit scalar to a vector.
// The scalar is inserted as sub_ss into an IMPLICIT_DEF v4f32.
393 def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
394 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
396 // Loading from memory automatically zeroing upper bits.
// Marked foldable as a load and rematerializable; selected for a plain f32
// load into FR32.
397 let canFoldAsLoad = 1, isReMaterializable = 1 in
398 def MOVSSrm : SSI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
399 "movss\t{$src, $dst|$dst, $src}",
400 [(set FR32:$dst, (loadf32 addr:$src))]>;
402 // MOVSSrm zeros the high parts of the register; represent this
403 // with SUBREG_TO_REG.
// Three source shapes map to the same SUBREG_TO_REG(MOVSSrm) output: a
// zero-extending move of a scalar load, a bare scalar_to_vector of a scalar
// load, and a zero-extending move of a full v4f32 load.
404 let AddedComplexity = 20 in {
405 def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
406 (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
407 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
408 (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
409 def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
410 (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
413 // Store scalar value to memory.
414 def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
415 "movss\t{$src, $dst|$dst, $src}",
416 [(store FR32:$src, addr:$dst)]>;
418 // Extract and store.
// Matches a store of element 0 extracted from a v4f32; the stored value in
// the output is the sub_ss subregister of $src.
// NOTE(review): the middle lines of this pattern are not visible here (the
// numbering jumps from 419 to 422).
419 def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
422 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
424 // Conversion instructions
// cvttss2si: selected for fp_to_sint from an FR32 register or an f32 load
// into GR32.
425 def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
426 "cvttss2si\t{$src, $dst|$dst, $src}",
427 [(set GR32:$dst, (fp_to_sint FR32:$src))]>;
428 def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
429 "cvttss2si\t{$src, $dst|$dst, $src}",
430 [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
// cvtsi2ss: selected for sint_to_fp from GR32 or an i32 load into FR32.
431 def CVTSI2SSrr : SSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
432 "cvtsi2ss\t{$src, $dst|$dst, $src}",
433 [(set FR32:$dst, (sint_to_fp GR32:$src))]>;
434 def CVTSI2SSrm : SSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
435 "cvtsi2ss\t{$src, $dst|$dst, $src}",
436 [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
438 // Match intrinsics which expect XMM operand(s).
// The two FR32-based cvtss2si records have empty pattern lists; the Int_
// records after them carry the int_x86_sse_cvtss2si patterns on VR128 /
// memory.
439 def CVTSS2SIrr: SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
440 "cvtss2si{l}\t{$src, $dst|$dst, $src}", []>;
441 def CVTSS2SIrm: SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
442 "cvtss2si{l}\t{$src, $dst|$dst, $src}", []>;
444 def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
445 "cvtss2si\t{$src, $dst|$dst, $src}",
446 [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>;
447 def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
448 "cvtss2si\t{$src, $dst|$dst, $src}",
449 [(set GR32:$dst, (int_x86_sse_cvtss2si
450 (load addr:$src)))]>;
452 // Match intrinsics which expect MM and XMM operand(s).
// cvtps2pi / cvttps2pi: VR128 or f64mem source, VR64 destination.
453 def Int_CVTPS2PIrr : PSI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
454 "cvtps2pi\t{$src, $dst|$dst, $src}",
455 [(set VR64:$dst, (int_x86_sse_cvtps2pi VR128:$src))]>;
456 def Int_CVTPS2PIrm : PSI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
457 "cvtps2pi\t{$src, $dst|$dst, $src}",
458 [(set VR64:$dst, (int_x86_sse_cvtps2pi
459 (load addr:$src)))]>;
460 def Int_CVTTPS2PIrr: PSI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
461 "cvttps2pi\t{$src, $dst|$dst, $src}",
462 [(set VR64:$dst, (int_x86_sse_cvttps2pi VR128:$src))]>;
463 def Int_CVTTPS2PIrm: PSI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
464 "cvttps2pi\t{$src, $dst|$dst, $src}",
465 [(set VR64:$dst, (int_x86_sse_cvttps2pi
466 (load addr:$src)))]>;
// cvtpi2ps: $src1 is tied to $dst; $src2 is a VR64 register or an i64mem
// operand.
467 let Constraints = "$src1 = $dst" in {
468 def Int_CVTPI2PSrr : PSI<0x2A, MRMSrcReg,
469 (outs VR128:$dst), (ins VR128:$src1, VR64:$src2),
470 "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
471 [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
473 def Int_CVTPI2PSrm : PSI<0x2A, MRMSrcMem,
474 (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
475 "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
476 [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
477 (load addr:$src2)))]>;
480 // Aliases for intrinsics
// cvttss2si selected from int_x86_sse_cvttss2si on a VR128 register or a
// loaded value.
481 def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
482 "cvttss2si\t{$src, $dst|$dst, $src}",
484 (int_x86_sse_cvttss2si VR128:$src))]>;
485 def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
486 "cvttss2si\t{$src, $dst|$dst, $src}",
488 (int_x86_sse_cvttss2si(load addr:$src)))]>;
// cvtsi2ss intrinsic forms: $src1 is tied to $dst; $src2 is GR32 or an
// i32mem operand.
490 let Constraints = "$src1 = $dst" in {
491 def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg,
492 (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
493 "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
494 [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
496 def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem,
497 (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
498 "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
499 [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
500 (loadi32 addr:$src2)))]>;
503 // Comparison instructions
// Scalar cmpss on FR32. $src1 is tied to $dst, the records are declared free
// of side effects, and their pattern lists are empty. The SSECC operand is
// spliced into the mnemonic through ${cc}.
504 let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
505 def CMPSSrr : SSIi8<0xC2, MRMSrcReg,
506 (outs FR32:$dst), (ins FR32:$src1, FR32:$src, SSECC:$cc),
507 "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
509 def CMPSSrm : SSIi8<0xC2, MRMSrcMem,
510 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, SSECC:$cc),
511 "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
513 // Accept explicit immediate argument form instead of comparison code.
// These assembler-only variants take the predicate as a plain i8 immediate
// ($src2) that is printed as an operand.
514 let isAsmParserOnly = 1 in {
515 def CMPSSrr_alt : SSIi8<0xC2, MRMSrcReg,
516 (outs FR32:$dst), (ins FR32:$src1, FR32:$src, i8imm:$src2),
517 "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
519 def CMPSSrm_alt : SSIi8<0xC2, MRMSrcMem,
520 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, i8imm:$src2),
521 "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
// Flag-setting compares: no outputs, EFLAGS listed in Defs. UCOMISS on FR32
// is selected from X86cmp; the COMISS records take VR128 operands and have
// empty pattern lists.
525 let Defs = [EFLAGS] in {
526 def UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins FR32:$src1, FR32:$src2),
527 "ucomiss\t{$src2, $src1|$src1, $src2}",
528 [(set EFLAGS, (X86cmp FR32:$src1, FR32:$src2))]>;
529 def UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs), (ins FR32:$src1, f32mem:$src2),
530 "ucomiss\t{$src2, $src1|$src1, $src2}",
531 [(set EFLAGS, (X86cmp FR32:$src1, (loadf32 addr:$src2)))]>;
533 def COMISSrr: PSI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
534 "comiss\t{$src2, $src1|$src1, $src2}", []>;
535 def COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
536 "comiss\t{$src2, $src1|$src1, $src2}", []>;
540 // Aliases to match intrinsics which expect XMM operand(s).
// cmpss intrinsic forms on VR128, selected from int_x86_sse_cmp_ss with the
// condition code passed as an immediate.
541 let Constraints = "$src1 = $dst" in {
542 def Int_CMPSSrr : SSIi8<0xC2, MRMSrcReg,
544 (ins VR128:$src1, VR128:$src, SSECC:$cc),
545 "cmp${cc}ss\t{$src, $dst|$dst, $src}",
546 [(set VR128:$dst, (int_x86_sse_cmp_ss
548 VR128:$src, imm:$cc))]>;
549 def Int_CMPSSrm : SSIi8<0xC2, MRMSrcMem,
551 (ins VR128:$src1, f32mem:$src, SSECC:$cc),
552 "cmp${cc}ss\t{$src, $dst|$dst, $src}",
553 [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
554 (load addr:$src), imm:$cc))]>;
// Intrinsic ucomiss/comiss forms: VR128 operands (memory forms use f128mem),
// selected from X86ucomi / X86comi and writing EFLAGS.
557 let Defs = [EFLAGS] in {
558 def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
559 "ucomiss\t{$src2, $src1|$src1, $src2}",
560 [(set EFLAGS, (X86ucomi (v4f32 VR128:$src1),
562 def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2),
563 "ucomiss\t{$src2, $src1|$src1, $src2}",
564 [(set EFLAGS, (X86ucomi (v4f32 VR128:$src1),
565 (load addr:$src2)))]>;
567 def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
568 "comiss\t{$src2, $src1|$src1, $src2}",
569 [(set EFLAGS, (X86comi (v4f32 VR128:$src1),
571 def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
572 "comiss\t{$src2, $src1|$src1, $src2}",
573 [(set EFLAGS, (X86comi (v4f32 VR128:$src1),
574 (load addr:$src2)))]>;
577 // Aliases of packed SSE1 instructions for scalar use. These all have names
578 // that start with 'Fs'.
580 // Alias instructions that map fld0 to pxor for sse.
// FsFLD0SS has no inputs and an empty asm string; it is selected when an FR32
// is set to fp32imm0 and requires HasSSE1.
// NOTE(review): the end of the 'let' list below is not visible here (the
// numbering jumps from 581 to 583).
581 let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
583 // FIXME: Set encoding to pseudo!
584 def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
585 [(set FR32:$dst, fp32imm0)]>,
586 Requires<[HasSSE1]>, TB, OpSize;
588 // Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
// No selection pattern; declared free of side effects.
590 let neverHasSideEffects = 1 in
591 def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
592 "movaps\t{$src, $dst|$dst, $src}", []>;
594 // Alias instruction to load FR32 from f128mem using movaps. Upper bits are
// Selected for an f32 load that passes the 16-byte alignedload predicate;
// foldable as a load and rematerializable.
596 let canFoldAsLoad = 1, isReMaterializable = 1 in
597 def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
598 "movaps\t{$src, $dst|$dst, $src}",
599 [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
601 /// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
// Parameters:
//   opc        - 8-bit opcode shared by all four records.
//   OpcodeStr  - mnemonic stem; "ps" / "pd" plus the operand string are
//                appended.
//   OpNode     - DAG operator used in the selection patterns.
//   NoPat      - defaults to 0. NOTE(review): its use is on lines that are
//                not visible here; presumably it selects an empty pattern -
//                confirm.
//   MayLoad    - assigned to mayLoad on the reg+mem records.
//   Commutable - assigned to isCommutable on the reg+reg records.
// The reg+mem forms take an f128mem operand but match scalar f32/f64 loads
// through memopfsf32 / memopfsf64.
603 multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
604 SDNode OpNode, int NoPat = 0,
605 bit MayLoad = 0, bit Commutable = 1> {
606 def PSrr : PSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
607 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
609 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))])> {
610 let isCommutable = Commutable;
613 def PDrr : PDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
614 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
616 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))])> {
617 let isCommutable = Commutable;
620 def PSrm : PSI<opc, MRMSrcMem, (outs FR32:$dst),
621 (ins FR32:$src1, f128mem:$src2),
622 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
624 [(set FR32:$dst, (OpNode FR32:$src1,
625 (memopfsf32 addr:$src2)))])> {
626 let mayLoad = MayLoad;
629 def PDrm : PDI<opc, MRMSrcMem, (outs FR64:$dst),
630 (ins FR64:$src1, f128mem:$src2),
631 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
633 [(set FR64:$dst, (OpNode FR64:$src1,
634 (memopfsf64 addr:$src2)))])> {
635 let mayLoad = MayLoad;
639 // Alias bitwise logical operations using SSE logical ops on packed FP values.
// AND/OR/XOR use the multiclass defaults (NoPat = 0, MayLoad = 0,
// Commutable = 1). ANDN passes 'undef' as the operator with NoPat = 1,
// MayLoad = 1 and Commutable = 0.
640 let Constraints = "$src1 = $dst" in {
641 defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand>;
642 defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for>;
643 defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor>;
645 let neverHasSideEffects = 1 in
646 defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef, 1, 1, 0>;
649 /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
// Emits a reg+reg ('rr') and a reg+mem ('rm') record for one scalar FP binop.
//   opc       - 8-bit opcode.
//   OpcodeStr - complete asm string; the caller supplies the suffix and the
//               operand text.
//   OpNode    - DAG operator to select from.
//   RC        - register class used for $dst, $src1 and the register $src2.
//   memop     - memory operand class for the 'rm' form. This parameter shares
//               its name with the 'memop' PatFrag; the 'rm' pattern matches a
//               plain 'load'.
650 multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
651 RegisterClass RC, X86MemOperand memop> {
652 let isCommutable = 1 in {
653 def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
654 OpcodeStr, [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>;
656 def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memop:$src2),
657 OpcodeStr, [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))]>;
660 /// basic_sse12_fp_binop_rm - SSE 1 & 2 binops come in both scalar and
663 /// In addition, we also have a special variant of the scalar form here to
664 /// represent the associated intrinsic operation. This form is unlike the
665 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
666 /// and leaves the top elements unmodified (therefore these cannot be commuted).
668 /// These three forms can each be reg+reg or reg+mem, so there are a total of
669 /// six "instructions".
// Parameters:
//   opc        - 8-bit opcode shared by every record.
//   OpcodeStr  - mnemonic stem ("add", "mul", ...); also spliced into the
//                intrinsic names int_x86_sse_<OpcodeStr>_ss and
//                int_x86_sse2_<OpcodeStr>_sd.
//   OpNode     - DAG operator for the non-intrinsic patterns.
//   Commutable - assigned to isCommutable on the packed reg+reg records.
// Records whose names start with V#NAME# are the VEX-encoded counterparts;
// they clear the "$src1 = $dst" tie by setting Constraints = "".
671 let Constraints = "$src1 = $dst" in {
672 multiclass basic_sse12_fp_binop_rm<bits<8> opc, string OpcodeStr,
673 SDNode OpNode, bit Commutable = 0> {
// VEX scalar forms: three-operand asm string, assembler-only, no tied operand.
675 let Constraints = "", isAsmParserOnly = 1, hasVEX_4VPrefix = 1 in {
676 // Scalar operation, reg+reg.
677 let Prefix = 12 /* XS */ in
678 defm V#NAME#SS : sse12_fp_scalar<opc,
679 !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
680 OpNode, FR32, f32mem>;
682 let Prefix = 11 /* XD */ in
683 defm V#NAME#SD : sse12_fp_scalar<opc,
684 !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
685 OpNode, FR64, f64mem>;
// Legacy SSE scalar forms: two-operand asm string with $src1 tied to $dst.
688 let Constraints = "$src1 = $dst" in {
689 // Scalar operation, reg+reg.
690 let Prefix = 12 /* XS */ in
691 defm SS : sse12_fp_scalar<opc,
692 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
693 OpNode, FR32, f32mem>;
694 let Prefix = 11 /* XD */ in
695 defm SD : sse12_fp_scalar<opc,
696 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
697 OpNode, FR64, f64mem>;
700 // Vector operation, reg+reg.
701 def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
702 (ins VR128:$src1, VR128:$src2),
703 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
704 [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
705 let isCommutable = Commutable;
708 def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
709 (ins VR128:$src1, VR128:$src2),
710 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
711 [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
712 let isCommutable = Commutable;
// VEX packed reg+reg forms.
// NOTE(review): the pattern argument of these two records is on lines not
// visible here (719, 729).
715 def V#NAME#PSrr : VPSI<opc, MRMSrcReg, (outs VR128:$dst),
716 (ins VR128:$src1, VR128:$src2),
717 !strconcat(OpcodeStr,
718 "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
720 let isCommutable = Commutable;
721 let Constraints = "";
722 let isAsmParserOnly = 1;
725 def V#NAME#PDrr : VPDI<opc, MRMSrcReg, (outs VR128:$dst),
726 (ins VR128:$src1, VR128:$src2),
727 !strconcat(OpcodeStr,
728 "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
730 let isCommutable = Commutable;
731 let Constraints = "";
732 let isAsmParserOnly = 1;
735 // Vector operation, reg+mem.
// The memory operand is matched through memopv4f32 / memopv2f64.
736 def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
737 (ins VR128:$src1, f128mem:$src2),
738 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
739 [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
741 def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
742 (ins VR128:$src1, f128mem:$src2),
743 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
744 [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
// VEX packed reg+mem forms: assembler-only, with empty pattern lists.
746 def V#NAME#PSrm : VPSI<opc, MRMSrcMem, (outs VR128:$dst),
747 (ins VR128:$src1, f128mem:$src2),
748 !strconcat(OpcodeStr,
749 "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []> {
750 let Constraints = "";
751 let isAsmParserOnly = 1;
754 def V#NAME#PDrm : VPDI<opc, MRMSrcMem, (outs VR128:$dst),
755 (ins VR128:$src1, f128mem:$src2),
756 !strconcat(OpcodeStr,
757 "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []> {
758 let Constraints = "";
759 let isAsmParserOnly = 1;
762 // Intrinsic operation, reg+reg.
// NOTE(review): the V*_Int records clear the tie (Constraints = "") but keep
// the two-operand asm string, unlike the other V* records above, which print
// three operands - confirm whether this is intended.
763 def V#NAME#SSrr_Int : VSSI<opc, MRMSrcReg, (outs VR128:$dst),
764 (ins VR128:$src1, VR128:$src2),
765 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
766 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse_",
767 !strconcat(OpcodeStr, "_ss")) VR128:$src1,
769 // int_x86_sse_xxx_ss
770 let Constraints = "";
773 def V#NAME#SDrr_Int : VSDI<opc, MRMSrcReg, (outs VR128:$dst),
774 (ins VR128:$src1, VR128:$src2),
775 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
776 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse2_",
777 !strconcat(OpcodeStr, "_sd")) VR128:$src1,
779 // int_x86_sse2_xxx_sd
780 let Constraints = "";
783 def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
784 (ins VR128:$src1, VR128:$src2),
785 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
786 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse_",
787 !strconcat(OpcodeStr, "_ss")) VR128:$src1,
789 // int_x86_sse_xxx_ss
791 def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
792 (ins VR128:$src1, VR128:$src2),
793 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
794 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse2_",
795 !strconcat(OpcodeStr, "_sd")) VR128:$src1,
797 // int_x86_sse2_xxx_sd
799 // Intrinsic operation, reg+mem.
// The memory operand is an ssmem / sdmem matched by the sse_load_f32 /
// sse_load_f64 complex patterns.
800 def V#NAME#SSrm_Int : VSSI<opc, MRMSrcMem, (outs VR128:$dst),
801 (ins VR128:$src1, ssmem:$src2),
802 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
803 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse_",
804 !strconcat(OpcodeStr, "_ss")) VR128:$src1,
805 sse_load_f32:$src2))]> {
806 // int_x86_sse_xxx_ss
807 let Constraints = "";
810 def V#NAME#SDrm_Int : VSDI<opc, MRMSrcMem, (outs VR128:$dst),
811 (ins VR128:$src1, sdmem:$src2),
812 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
813 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse2_",
814 !strconcat(OpcodeStr, "_sd")) VR128:$src1,
815 sse_load_f64:$src2))]> {
816 // int_x86_sse2_xxx_sd
817 let Constraints = "";
820 def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
821 (ins VR128:$src1, ssmem:$src2),
822 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
823 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse_",
824 !strconcat(OpcodeStr, "_ss")) VR128:$src1,
825 sse_load_f32:$src2))]>;
826 // int_x86_sse_xxx_ss
828 def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
829 (ins VR128:$src1, sdmem:$src2),
830 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
831 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse2_",
832 !strconcat(OpcodeStr, "_sd")) VR128:$src1,
833 sse_load_f64:$src2))]>;
834 // int_x86_sse2_xxx_sd
838 // Arithmetic instructions
// Instantiate basic_sse12_fp_binop_rm for the four basic FP operators.
// ADD and MUL pass 1 as the fourth template argument (presumably the
// Commutable bit -- the multiclass header is not visible here; confirm).
839 defm ADD : basic_sse12_fp_binop_rm<0x58, "add", fadd, 1>;
840 defm MUL : basic_sse12_fp_binop_rm<0x59, "mul", fmul, 1>;
// SUB and DIV are order-sensitive, so isCommutable is forced to 0 for them.
842 let isCommutable = 0 in {
843 defm SUB : basic_sse12_fp_binop_rm<0x5C, "sub", fsub>;
844 defm DIV : basic_sse12_fp_binop_rm<0x5E, "div", fdiv>;
847 /// sse12_fp_binop_rm - Other SSE 1 & 2 binops
849 /// This multiclass is like basic_sse12_fp_binop_rm, with the addition of
850 /// instructions for a full-vector intrinsic form. Operations that map
851 /// onto C operators don't use this form since they just use the plain
852 /// vector form instead of having a separate vector intrinsic form.
854 /// This provides a total of sixteen "instructions".
// Template parameters of sse12_fp_binop_rm:
//   opc        - 8-bit opcode shared by every record in the multiclass.
//   OpcodeStr  - mnemonic stem; "ss"/"sd"/"ps"/"pd" is appended per record and
//                it is also spliced into the intrinsic names.
//   OpNode     - SDNode matched by the non-intrinsic records.
//   Commutable - copied into isCommutable on the reg+reg records (default 0).
// Every record is two-address: the enclosing let ties $src1 to $dst.
856 let Constraints = "$src1 = $dst" in {
857 multiclass sse12_fp_binop_rm<bits<8> opc, string OpcodeStr,
858 SDNode OpNode, bit Commutable = 0> {
860 // Scalar operation, reg+reg.
861 def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
862 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
863 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
864 let isCommutable = Commutable;
867 def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
868 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
869 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
870 let isCommutable = Commutable;
873 // Scalar operation, reg+mem.
// The memory forms do not set isCommutable; only $src2 may be a load.
874 def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
875 (ins FR32:$src1, f32mem:$src2),
876 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
877 [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
879 def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
880 (ins FR64:$src1, f64mem:$src2),
881 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
882 [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
884 // Vector operation, reg+reg.
// The result type is pinned explicitly (v4f32 / v2f64) on the reg+reg patterns.
885 def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
886 (ins VR128:$src1, VR128:$src2),
887 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
888 [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
889 let isCommutable = Commutable;
892 def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
893 (ins VR128:$src1, VR128:$src2),
894 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
895 [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
896 let isCommutable = Commutable;
899 // Vector operation, reg+mem.
// The loaded operand goes through memopv4f32 / memopv2f64.
900 def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
901 (ins VR128:$src1, f128mem:$src2),
902 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
903 [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
905 def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
906 (ins VR128:$src1, f128mem:$src2),
907 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
908 [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
910 // Intrinsic operation, reg+reg.
// The scalar intrinsic records take and produce whole VR128 registers and
// resolve the intrinsic by name from OpcodeStr.
911 def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
912 (ins VR128:$src1, VR128:$src2),
913 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
914 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse_",
915 !strconcat(OpcodeStr, "_ss")) VR128:$src1,
917 // int_x86_sse_xxx_ss
918 let isCommutable = Commutable;
921 def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
922 (ins VR128:$src1, VR128:$src2),
923 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
924 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse2_",
925 !strconcat(OpcodeStr, "_sd")) VR128:$src1,
927 // int_x86_sse2_xxx_sd
928 let isCommutable = Commutable;
931 // Intrinsic operation, reg+mem.
932 def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
933 (ins VR128:$src1, ssmem:$src2),
934 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
935 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse_",
936 !strconcat(OpcodeStr, "_ss")) VR128:$src1,
937 sse_load_f32:$src2))]>;
938 // int_x86_sse_xxx_ss
940 def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
941 (ins VR128:$src1, sdmem:$src2),
942 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
943 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse2_",
944 !strconcat(OpcodeStr, "_sd")) VR128:$src1,
945 sse_load_f64:$src2))]>;
946 // int_x86_sse2_xxx_sd
948 // Vector intrinsic operation, reg+reg.
// Full-vector intrinsics: "int_x86_sse_<op>_ps" and "int_x86_sse2_<op>_pd".
949 def PSrr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst),
950 (ins VR128:$src1, VR128:$src2),
951 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
952 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse_",
953 !strconcat(OpcodeStr, "_ps")) VR128:$src1,
955 // int_x86_sse_xxx_ps
956 let isCommutable = Commutable;
959 def PDrr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst),
960 (ins VR128:$src1, VR128:$src2),
961 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
962 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse2_",
963 !strconcat(OpcodeStr, "_pd")) VR128:$src1,
965 // int_x86_sse2_xxx_pd
966 let isCommutable = Commutable;
969 // Vector intrinsic operation, reg+mem.
970 def PSrm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst),
971 (ins VR128:$src1, f128mem:$src2),
972 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
973 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse_",
974 !strconcat(OpcodeStr, "_ps")) VR128:$src1,
975 (memopv4f32 addr:$src2)))]>;
976 // int_x86_sse_xxx_ps
978 def PDrm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst),
979 (ins VR128:$src1, f128mem:$src2),
980 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
981 [(set VR128:$dst, (!nameconcat<Intrinsic>("int_x86_sse2_",
982 !strconcat(OpcodeStr, "_pd")) VR128:$src1,
983 (memopv2f64 addr:$src2)))]>;
984 // int_x86_sse2_xxx_pd
// MAX / MIN are matched through the target nodes X86fmax / X86fmin. No fourth
// argument is passed, so Commutable keeps its default of 0.
988 defm MAX : sse12_fp_binop_rm<0x5F, "max", X86fmax>;
989 defm MIN : sse12_fp_binop_rm<0x5D, "min", X86fmin>;
991 //===----------------------------------------------------------------------===//
992 // SSE packed FP Instructions
// MOVAPS: reg-to-reg, load and store forms.
// The reg-to-reg form has no selection pattern and is marked side-effect free.
995 let neverHasSideEffects = 1 in
996 def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
997 "movaps\t{$src, $dst|$dst, $src}", []>;
// Load form: matches an aligned v4f32 load; foldable and rematerializable.
998 let canFoldAsLoad = 1, isReMaterializable = 1 in
999 def MOVAPSrm : PSI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1000 "movaps\t{$src, $dst|$dst, $src}",
1001 [(set VR128:$dst, (alignedloadv4f32 addr:$src))]>;
// Store form (opcode 0x29): matches an aligned store of a v4f32 value.
1003 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
1004 "movaps\t{$src, $dst|$dst, $src}",
1005 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
// MOVUPS: reg-to-reg, load and store forms. These mirror the MOVAPS records
// but match the plain (not alignment-checked) loadv4f32 / store fragments.
1007 let neverHasSideEffects = 1 in
1008 def MOVUPSrr : PSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1009 "movups\t{$src, $dst|$dst, $src}", []>;
1010 let canFoldAsLoad = 1, isReMaterializable = 1 in
1011 def MOVUPSrm : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1012 "movups\t{$src, $dst|$dst, $src}",
1013 [(set VR128:$dst, (loadv4f32 addr:$src))]>;
1014 def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
1015 "movups\t{$src, $dst|$dst, $src}",
1016 [(store (v4f32 VR128:$src), addr:$dst)]>;
1018 // Intrinsic forms of MOVUPS load and store
// Same opcodes and assembly strings as MOVUPSrm / MOVUPSmr, but selected from
// the int_x86_sse_loadu_ps / int_x86_sse_storeu_ps intrinsics.
1019 let canFoldAsLoad = 1, isReMaterializable = 1 in
1020 def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1021 "movups\t{$src, $dst|$dst, $src}",
1022 [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
1023 def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
1024 "movups\t{$src, $dst|$dst, $src}",
1025 [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
// MOVLPS / MOVHPS load forms: two-address ($src1 tied to $dst), taking a
// 64-bit memory operand. The loaded f64 is turned into a vector with
// scalar_to_vector and bitcast to v4f32 before being combined with $src1.
// AddedComplexity = 20 raises the priority of these patterns.
1027 let Constraints = "$src1 = $dst" in {
1028 let AddedComplexity = 20 in {
1029 def MOVLPSrm : PSI<0x12, MRMSrcMem,
1030 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
1031 "movlps\t{$src2, $dst|$dst, $src2}",
1034 (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
1035 def MOVHPSrm : PSI<0x16, MRMSrcMem,
1036 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
1037 "movhps\t{$src2, $dst|$dst, $src2}",
1039 (movlhps VR128:$src1,
1040 (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
1041 } // AddedComplexity
1042 } // Constraints = "$src1 = $dst"
// A movlhps whose second operand is an X86vzload (viewed as v2i64 and bitcast
// to v4i32) is selected as MOVHPSrm with the load folded into it.
1045 def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
1046 (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
// MOVLPS store: matches a store of element 0 of $src viewed as v2f64.
1048 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
1049 "movlps\t{$src, $dst|$dst, $src}",
1050 [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
1051 (iPTR 0))), addr:$dst)]>;
1053 // v2f64 extract element 1 is always custom lowered to unpack high to low
1054 // and extract element 0 so the non-store version isn't too horrible.
// MOVHPS store: matches a store of element 0 of (unpckh $src, undef).
1055 def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
1056 "movhps\t{$src, $dst|$dst, $src}",
1057 [(store (f64 (vector_extract
1058 (unpckh (bc_v2f64 (v4f32 VR128:$src)),
1059 (undef)), (iPTR 0))), addr:$dst)]>;
// MOVLHPS / MOVHLPS register forms: two-address ($src1 tied to $dst), matched
// from the movlhps / movhlps fragments on v4f32.
1061 let Constraints = "$src1 = $dst" in {
1062 let AddedComplexity = 20 in {
1063 def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
1064 (ins VR128:$src1, VR128:$src2),
1065 "movlhps\t{$src2, $dst|$dst, $src2}",
1067 (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>;
1069 def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
1070 (ins VR128:$src1, VR128:$src2),
1071 "movhlps\t{$src2, $dst|$dst, $src2}",
1073 (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
1074 } // AddedComplexity
1075 } // Constraints = "$src1 = $dst"
// movddup of a register with an undef second operand is selected as MOVLHPSrr
// with the same register used for both sources (v4f32 and v2i64 variants).
1077 let AddedComplexity = 20 in {
1078 def : Pat<(v4f32 (movddup VR128:$src, (undef))),
1079 (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
1080 def : Pat<(v2i64 (movddup VR128:$src, (undef))),
1081 (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
1088 /// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms.
1090 /// In addition, we also have a special variant of the scalar form here to
1091 /// represent the associated intrinsic operation. This form is unlike the
1092 /// plain scalar form, in that it takes an entire vector (instead of a
1093 /// scalar) and leaves the top elements undefined.
1095 /// And, we have a special variant form for a full-vector intrinsic form.
1097 /// These four forms can each have a reg or a mem operand, so there are a
1098 /// total of eight "instructions".
// The records below use opc, OpcodeStr, OpNode (plain scalar and vector
// patterns), F32Int (scalar intrinsic patterns), V4F32Int (vector intrinsic
// patterns) and Commutable (copied into isCommutable on the register forms).
1100 multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
1104 bit Commutable = 0> {
1105 // Scalar operation, reg.
1106 def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
1107 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
1108 [(set FR32:$dst, (OpNode FR32:$src))]> {
1109 let isCommutable = Commutable;
1112 // Scalar operation, mem.
// Built on the raw I class with an explicit XS prefix so that the predicate
// list can be replaced: this form additionally requires OptForSize.
1113 def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
1114 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
1115 [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS,
1116 Requires<[HasSSE1, OptForSize]>;
1118 // Vector operation, reg.
1119 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1120 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
1121 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> {
1122 let isCommutable = Commutable;
1125 // Vector operation, mem.
1126 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1127 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
1128 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>;
1130 // Intrinsic operation, reg.
1131 def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1132 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
1133 [(set VR128:$dst, (F32Int VR128:$src))]> {
1134 let isCommutable = Commutable;
1137 // Intrinsic operation, mem.
1138 def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
1139 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
1140 [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
1142 // Vector intrinsic operation, reg
1143 def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1144 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
1145 [(set VR128:$dst, (V4F32Int VR128:$src))]> {
1146 let isCommutable = Commutable;
1149 // Vector intrinsic operation, mem
1150 def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1151 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
1152 [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>;
// Square root: generic fsqrt node plus the sqrt_ss / sqrt_ps intrinsics.
1156 defm SQRT : sse1_fp_unop_rm<0x51, "sqrt", fsqrt,
1157 int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;
1159 // Reciprocal approximations. Note that these typically require refinement
1160 // in order to obtain suitable precision.
// Both are matched through the target nodes X86frsqrt / X86frcp.
1161 defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
1162 int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
1163 defm RCP : sse1_fp_unop_rm<0x53, "rcp", X86frcp,
1164 int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;
1166 /// sse12_fp_pack_logical - SSE 1 & 2 packed FP logical ops
///
/// Defines PSrr, PDrr, PSrm and PDrm. When HasPat is non-zero, each record
/// takes its pattern from Pattern[0..3] (in that order); otherwise a default
/// pattern applying OpNode to the operands viewed as v2i64 is used.
/// Commutable is copied into isCommutable on the two register forms.
1168 multiclass sse12_fp_pack_logical<bits<8> opc, string OpcodeStr,
1169 SDNode OpNode, int HasPat = 0,
1171 list<list<dag>> Pattern = []> {
1172 def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
1173 (ins VR128:$src1, VR128:$src2),
1174 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
1175 !if(HasPat, Pattern[0],
1176 [(set VR128:$dst, (v2i64 (OpNode VR128:$src1,
1178 { let isCommutable = Commutable; }
1180 def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
1181 (ins VR128:$src1, VR128:$src2),
1182 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
1183 !if(HasPat, Pattern[1],
1184 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
1185 (bc_v2i64 (v2f64 VR128:$src2))))])>
1186 { let isCommutable = Commutable; }
// Memory forms: the second operand is loaded through memopv2i64.
1188 def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
1189 (ins VR128:$src1, f128mem:$src2),
1190 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
1191 !if(HasPat, Pattern[2],
1192 [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
1193 (memopv2i64 addr:$src2)))])>;
1195 def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
1196 (ins VR128:$src1, f128mem:$src2),
1197 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
1198 !if(HasPat, Pattern[3],
1199 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
1200 (memopv2i64 addr:$src2)))])>;
// Packed FP logical instructions, all two-address ($src1 tied to $dst).
// AND / OR / XOR use the default patterns built from the generic nodes.
// ANDN has no single node, so it passes undef as a placeholder OpNode, sets
// HasPat = 1 and Commutable = 0, and supplies its own patterns of the shape
// (and (not $src1), $src2), written with either xor-with-all-ones or vnot.
1204 let Constraints = "$src1 = $dst" in {
1205 defm AND : sse12_fp_pack_logical<0x54, "and", and>;
1206 defm OR : sse12_fp_pack_logical<0x56, "or", or>;
1207 defm XOR : sse12_fp_pack_logical<0x57, "xor", xor>;
1208 defm ANDN : sse12_fp_pack_logical<0x55, "andn", undef /* dummy */, 1, 0, [
1210 [(set VR128:$dst, (v2i64 (and (xor VR128:$src1,
1211 (bc_v2i64 (v4i32 immAllOnesV))),
1214 [(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
1215 (bc_v2i64 (v2f64 VR128:$src2))))],
1217 [(set VR128:$dst, (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)),
1218 (bc_v2i64 (v4i32 immAllOnesV))),
1219 (memopv2i64 addr:$src2))))],
1221 [(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
1222 (memopv2i64 addr:$src2)))]]>;
// Packed single-precision compare, selected from int_x86_sse_cmp_ps.
// The condition is an SSECC operand that is printed as part of the mnemonic
// (cmp${cc}ps) and matched as an immediate in the pattern.
1225 let Constraints = "$src1 = $dst" in {
1226 def CMPPSrri : PSIi8<0xC2, MRMSrcReg,
1227 (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
1228 "cmp${cc}ps\t{$src, $dst|$dst, $src}",
1229 [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
1230 VR128:$src, imm:$cc))]>;
1231 def CMPPSrmi : PSIi8<0xC2, MRMSrcMem,
1232 (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
1233 "cmp${cc}ps\t{$src, $dst|$dst, $src}",
1234 [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
1235 (memop addr:$src), imm:$cc))]>;
1237 // Accept explicit immediate argument form instead of comparison code.
1238 let isAsmParserOnly = 1 in {
// Explicit-immediate spellings of CMPPS ("cmpps $imm, src, dst"). They carry no
// selection pattern. The assembly string has two syntax variants separated by
// '|'; both must print the i8 immediate $src2. The second (Intel-order)
// variant lists the operands in reverse: $dst, $src, $src2.
1239 def CMPPSrri_alt : PSIi8<0xC2, MRMSrcReg,
1240 (outs VR128:$dst), (ins VR128:$src1, VR128:$src, i8imm:$src2),
1241 "cmpps\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
1242 def CMPPSrmi_alt : PSIi8<0xC2, MRMSrcMem,
1243 (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, i8imm:$src2),
1244 "cmpps\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
// Select the X86cmpps node (v4i32 result from v4f32 operands) onto the same
// CMPPS instructions, register and folded-load variants.
1247 def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
1248 (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
1249 def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
1250 (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
1252 // Shuffle and unpack instructions
// All records here are two-address ($src1 tied to $dst).
// SHUFPS takes its shuffle control as the i8 immediate $src3, which the
// pattern binds to the shufp fragment itself (shufp:$src3).
1253 let Constraints = "$src1 = $dst" in {
1254 let isConvertibleToThreeAddress = 1 in // Convert to pshufd
1255 def SHUFPSrri : PSIi8<0xC6, MRMSrcReg,
1256 (outs VR128:$dst), (ins VR128:$src1,
1257 VR128:$src2, i8imm:$src3),
1258 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
1260 (v4f32 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
1261 def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem,
1262 (outs VR128:$dst), (ins VR128:$src1,
1263 f128mem:$src2, i8imm:$src3),
1264 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
1267 VR128:$src1, (memopv4f32 addr:$src2))))]>;
// UNPCKHPS / UNPCKLPS, register and folded-load forms, matched from the
// unpckh / unpckl fragments with a raised pattern priority.
1269 let AddedComplexity = 10 in {
1270 def UNPCKHPSrr : PSI<0x15, MRMSrcReg,
1271 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1272 "unpckhps\t{$src2, $dst|$dst, $src2}",
1274 (v4f32 (unpckh VR128:$src1, VR128:$src2)))]>;
1275 def UNPCKHPSrm : PSI<0x15, MRMSrcMem,
1276 (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
1277 "unpckhps\t{$src2, $dst|$dst, $src2}",
1279 (v4f32 (unpckh VR128:$src1,
1280 (memopv4f32 addr:$src2))))]>;
1282 def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
1283 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1284 "unpcklps\t{$src2, $dst|$dst, $src2}",
1286 (v4f32 (unpckl VR128:$src1, VR128:$src2)))]>;
1287 def UNPCKLPSrm : PSI<0x14, MRMSrcMem,
1288 (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
1289 "unpcklps\t{$src2, $dst|$dst, $src2}",
1291 (unpckl VR128:$src1, (memopv4f32 addr:$src2)))]>;
1292 } // AddedComplexity
1293 } // Constraints = "$src1 = $dst"
// MOVMSKPS / MOVMSKPD: VR128 source, GR32 result, selected from the
// int_x86_sse_movmsk_ps / int_x86_sse2_movmsk_pd intrinsics.
1296 def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
1297 "movmskps\t{$src, $dst|$dst, $src}",
1298 [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>;
1299 def MOVMSKPDrr : PDI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
1300 "movmskpd\t{$src, $dst|$dst, $src}",
1301 [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>;
1303 // Prefetch intrinsic.
// All four share opcode 0x18 and differ in the ModRM form (MRM0m..MRM3m).
// The third prefetch operand selects the variant: 3 -> t0, 2 -> t1, 1 -> t2,
// 0 -> nta. The second operand is matched as any immediate (presumably the
// read/write hint, which is ignored here -- confirm against the node definition).
1304 def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src),
1305 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>;
1306 def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src),
1307 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>;
1308 def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src),
1309 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>;
1310 def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
1311 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>;
1313 // Non-temporal stores
// Intrinsic form: selected from int_x86_sse_movnt_ps.
1314 def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
1315 "movntps\t{$src, $dst|$dst, $src}",
1316 [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
// Pattern forms: selected from (aligned)nontemporalstore fragments. The large
// AddedComplexity makes them win over the ordinary store patterns.
1318 let AddedComplexity = 400 in { // Prefer non-temporal versions
1319 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
1320 "movntps\t{$src, $dst|$dst, $src}",
1321 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
// v2f64 values are stored with movntdq.
1323 def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
1324 "movntdq\t{$src, $dst|$dst, $src}",
1325 [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
// Integer non-temporal stores from GR32 / GR64; both require SSE2.
1327 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
1328 "movnti\t{$src, $dst|$dst, $src}",
1329 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
1330 TB, Requires<[HasSSE2]>;
1332 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
1333 "movnti\t{$src, $dst|$dst, $src}",
1334 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
1335 TB, Requires<[HasSSE2]>;
1338 // Load, store, and memory fence
// SFENCE takes no operands and is selected from int_x86_sse_sfence.
1339 def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
1340 TB, Requires<[HasSSE1]>;
// MXCSR load / store through a 32-bit memory operand, selected from the
// int_x86_sse_ldmxcsr / int_x86_sse_stmxcsr intrinsics.
1343 def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
1344 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>;
1345 def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
1346 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
1348 // Alias instructions that map zero vector to pxor / xorp* for sse.
1349 // We set canFoldAsLoad because this can be converted to a constant-pool
1350 // load of an all-zeros value if folding it would be beneficial.
1351 // FIXME: Change encoding to pseudo!
// The records have no inputs and an empty assembly string, and are marked
// isCodeGenOnly. One exists per execution domain: PS and PD use opcode 0x57,
// PI uses 0xEF and is placed in SSEPackedInt.
1352 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
1353 isCodeGenOnly = 1 in {
1354 def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
1355 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
1356 def V_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
1357 [(set VR128:$dst, (v2f64 immAllZerosV))]>;
1358 let ExeDomain = SSEPackedInt in
1359 def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
1360 [(set VR128:$dst, (v4i32 immAllZerosV))]>;
// The remaining integer vector types reuse V_SET0PI for their zero vector.
1363 def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>;
1364 def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>;
1365 def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>;
// Extracting element 0 of a v4f32 is just a sub-register read (sub_ss).
1367 def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
1368 (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
1370 //===---------------------------------------------------------------------===//
1371 // SSE2 Instructions
1372 //===---------------------------------------------------------------------===//
1374 // Move Instructions. Register-to-register movsd is not used for FR64
1375 // register copies because it's a partial register update; FsMOVAPDrr is
1376 // used instead. Register-to-register movsd is not modeled as an INSERT_SUBREG
1377 // because INSERT_SUBREG requires that the insert be implementable in terms of
1378 // a copy, and as just mentioned, we don't use movsd for copies.
// Register MOVSD: two-address, inserts the FR64 $src2 into the low element of
// the VR128 $src1 (movl of $src1 with scalar_to_vector $src2).
1379 let Constraints = "$src1 = $dst" in
1380 def MOVSDrr : SDI<0x10, MRMSrcReg,
1381 (outs VR128:$dst), (ins VR128:$src1, FR64:$src2),
1382 "movsd\t{$src2, $dst|$dst, $src2}",
1383 [(set (v2f64 VR128:$dst),
1384 (movl VR128:$src1, (scalar_to_vector FR64:$src2)))]>;
1386 // Extract the low 64-bit value from one vector and insert it into another.
// The low element of $src2 is read via its sub_sd sub-register.
1387 let AddedComplexity = 15 in
1388 def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
1389 (MOVSDrr (v2f64 VR128:$src1),
1390 (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
1392 // Implicitly promote a 64-bit scalar to a vector.
// The upper element comes from IMPLICIT_DEF, i.e. it is left undefined.
1393 def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
1394 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
1396 // Loading from memory automatically zeroing upper bits.
1397 let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 20 in
1398 def MOVSDrm : SDI<0x10, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
1399 "movsd\t{$src, $dst|$dst, $src}",
1400 [(set FR64:$dst, (loadf64 addr:$src))]>;
1402 // MOVSDrm zeros the high parts of the register; represent this
1403 // with SUBREG_TO_REG.
// Each pattern below is a different DAG spelling of "load 64 bits into the
// low element"; all of them select the same MOVSDrm wrapped in SUBREG_TO_REG.
1404 let AddedComplexity = 20 in {
1405 def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
1406 (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
1407 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
1408 (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
1409 def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
1410 (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
1411 def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
1412 (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
1413 def : Pat<(v2f64 (X86vzload addr:$src)),
1414 (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
1417 // Store scalar value to memory.
1418 def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
1419 "movsd\t{$src, $dst|$dst, $src}",
1420 [(store FR64:$src, addr:$dst)]>;
1422 // Extract and store.
// A store of element 0 of a v2f64 reads the value through the sub_sd
// sub-register instead of emitting a separate extract.
1423 def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
1426 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
1428 // Conversion instructions
// Scalar conversions with selection patterns:
//   cvttsd2si : f64 -> i32 (fp_to_sint)
//   cvtsd2ss  : f64 -> f32 (fround)
//   cvtsi2sd  : i32 -> f64 (sint_to_fp)
1429 def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
1430 "cvttsd2si\t{$src, $dst|$dst, $src}",
1431 [(set GR32:$dst, (fp_to_sint FR64:$src))]>;
1432 def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src),
1433 "cvttsd2si\t{$src, $dst|$dst, $src}",
1434 [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
1435 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1436 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1437 [(set FR32:$dst, (fround FR64:$src))]>;
// The folded-load form is built on I + XD so it can additionally require
// OptForSize.
1438 def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1439 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1440 [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD,
1441 Requires<[HasSSE2, OptForSize]>;
1442 def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src),
1443 "cvtsi2sd\t{$src, $dst|$dst, $src}",
1444 [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
1445 def CVTSI2SDrm : SDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i32mem:$src),
1446 "cvtsi2sd\t{$src, $dst|$dst, $src}",
1447 [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
// Packed conversions. These records have empty pattern lists, so instruction
// selection never produces them from these definitions.
1449 def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1450 "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
1451 def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1452 "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
1453 def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1454 "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
1455 def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1456 "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
1457 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1458 "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
1459 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1460 "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
1461 def CVTDQ2PSrr : PSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1462 "cvtdq2ps\t{$src, $dst|$dst, $src}", []>;
1463 def CVTDQ2PSrm : PSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1464 "cvtdq2ps\t{$src, $dst|$dst, $src}", []>;
// Pattern-less COMISD forms. COMISD reports its result in EFLAGS, so the
// implicit definition is declared here, in the same way as for the
// Int_COMISDrr / Int_COMISDrm records that share opcode 0x2F. Without it, later
// passes would not see that these instructions clobber the flags.
let Defs = [EFLAGS] in {
1465 def COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
1466 "comisd\t{$src2, $src1|$src1, $src2}", []>;
1467 def COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
1468 "comisd\t{$src2, $src1|$src1, $src2}", []>;
} // Defs = [EFLAGS]
1470 // SSE2 instructions with XS prefix
// cvtss2sd: f32 -> f64 (fextend). Built on I + XS with an explicit HasSSE2
// requirement.
1471 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1472 "cvtss2sd\t{$src, $dst|$dst, $src}",
1473 [(set FR64:$dst, (fextend FR32:$src))]>, XS,
1474 Requires<[HasSSE2]>;
// The folded extending load is only used when optimizing for size.
1475 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1476 "cvtss2sd\t{$src, $dst|$dst, $src}",
1477 [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
1478 Requires<[HasSSE2, OptForSize]>;
// When optimizing for speed, the extending load is instead split into a
// MOVSSrm load followed by a register-to-register CVTSS2SDrr.
1480 def : Pat<(extloadf32 addr:$src),
1481 (CVTSS2SDrr (MOVSSrm addr:$src))>,
1482 Requires<[HasSSE2, OptForSpeed]>;
1484 // Match intrinsics which expect XMM operand(s).
// cvtsd2si (opcode 0x2D), selected from int_x86_sse2_cvtsd2si. The memory form
// declares a 128-bit memory operand and matches a plain load.
1485 def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
1486 "cvtsd2si\t{$src, $dst|$dst, $src}",
1487 [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>;
1488 def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
1489 "cvtsd2si\t{$src, $dst|$dst, $src}",
1490 [(set GR32:$dst, (int_x86_sse2_cvtsd2si
1491 (load addr:$src)))]>;
1493 // Match intrinsics which expect MM and XMM operand(s).
// Conversions between VR64 and VR128 values: cvtpd2pi, cvttpd2pi and
// cvtpi2pd, each with a register and a memory source form.
1494 def Int_CVTPD2PIrr : PDI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
1495 "cvtpd2pi\t{$src, $dst|$dst, $src}",
1496 [(set VR64:$dst, (int_x86_sse_cvtpd2pi VR128:$src))]>;
1497 def Int_CVTPD2PIrm : PDI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
1498 "cvtpd2pi\t{$src, $dst|$dst, $src}",
1499 [(set VR64:$dst, (int_x86_sse_cvtpd2pi
1500 (memop addr:$src)))]>;
1501 def Int_CVTTPD2PIrr: PDI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
1502 "cvttpd2pi\t{$src, $dst|$dst, $src}",
1503 [(set VR64:$dst, (int_x86_sse_cvttpd2pi VR128:$src))]>;
1504 def Int_CVTTPD2PIrm: PDI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
1505 "cvttpd2pi\t{$src, $dst|$dst, $src}",
1506 [(set VR64:$dst, (int_x86_sse_cvttpd2pi
1507 (memop addr:$src)))]>;
1508 def Int_CVTPI2PDrr : PDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
1509 "cvtpi2pd\t{$src, $dst|$dst, $src}",
1510 [(set VR128:$dst, (int_x86_sse_cvtpi2pd VR64:$src))]>;
1511 def Int_CVTPI2PDrm : PDI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1512 "cvtpi2pd\t{$src, $dst|$dst, $src}",
1513 [(set VR128:$dst, (int_x86_sse_cvtpi2pd
1514 (load addr:$src)))]>;
1516 // Aliases for intrinsics
// cvttsd2si (opcode 0x2C) with a VR128 or memory source, selected from
// int_x86_sse2_cvttsd2si.
1517 def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
1518 "cvttsd2si\t{$src, $dst|$dst, $src}",
1520 (int_x86_sse2_cvttsd2si VR128:$src))]>;
1521 def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
1522 "cvttsd2si\t{$src, $dst|$dst, $src}",
1523 [(set GR32:$dst, (int_x86_sse2_cvttsd2si
1524 (load addr:$src)))]>;
1526 // Comparison instructions
// Scalar double compare on FR64, two-address. These records have no selection
// patterns and are marked side-effect free. The condition code is an SSECC
// operand printed inside the mnemonic.
1527 let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
1528 def CMPSDrr : SDIi8<0xC2, MRMSrcReg,
1529 (outs FR64:$dst), (ins FR64:$src1, FR64:$src, SSECC:$cc),
1530 "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
1532 def CMPSDrm : SDIi8<0xC2, MRMSrcMem,
1533 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, SSECC:$cc),
1534 "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
1536 // Accept explicit immediate argument form instead of comparison code.
// Here the i8 immediate $src2 is printed as an ordinary operand.
1537 let isAsmParserOnly = 1 in {
1538 def CMPSDrr_alt : SDIi8<0xC2, MRMSrcReg,
1539 (outs FR64:$dst), (ins FR64:$src1, FR64:$src, i8imm:$src2),
1540 "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
1542 def CMPSDrm_alt : SDIi8<0xC2, MRMSrcMem,
1543 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, i8imm:$src2),
1544 "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
// UCOMISD on FR64 operands: produces no register result; the X86cmp node is
// matched and its result is modelled as a write to EFLAGS.
1548 let Defs = [EFLAGS] in {
1549 def UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins FR64:$src1, FR64:$src2),
1550 "ucomisd\t{$src2, $src1|$src1, $src2}",
1551 [(set EFLAGS, (X86cmp FR64:$src1, FR64:$src2))]>;
1552 def UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs), (ins FR64:$src1, f64mem:$src2),
1553 "ucomisd\t{$src2, $src1|$src1, $src2}",
1554 [(set EFLAGS, (X86cmp FR64:$src1, (loadf64 addr:$src2)))]>;
1555 } // Defs = [EFLAGS]
1557 // Aliases to match intrinsics which expect XMM operand(s).
// Same opcode and assembly as CMPSDrr / CMPSDrm, but operating on VR128 and
// selected from int_x86_sse2_cmp_sd.
1558 let Constraints = "$src1 = $dst" in {
1559 def Int_CMPSDrr : SDIi8<0xC2, MRMSrcReg,
1561 (ins VR128:$src1, VR128:$src, SSECC:$cc),
1562 "cmp${cc}sd\t{$src, $dst|$dst, $src}",
1563 [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
1564 VR128:$src, imm:$cc))]>;
1565 def Int_CMPSDrm : SDIi8<0xC2, MRMSrcMem,
1567 (ins VR128:$src1, f64mem:$src, SSECC:$cc),
1568 "cmp${cc}sd\t{$src, $dst|$dst, $src}",
1569 [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
1570 (load addr:$src), imm:$cc))]>;
// VR128 forms of UCOMISD / COMISD, matched from the X86ucomi / X86comi nodes
// on v2f64 operands. All four write EFLAGS and produce no register result.
1573 let Defs = [EFLAGS] in {
1574 def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
1575 "ucomisd\t{$src2, $src1|$src1, $src2}",
1576 [(set EFLAGS, (X86ucomi (v2f64 VR128:$src1),
1578 def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2),
1579 "ucomisd\t{$src2, $src1|$src1, $src2}",
1580 [(set EFLAGS, (X86ucomi (v2f64 VR128:$src1),
1581 (load addr:$src2)))]>;
1583 def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
1584 "comisd\t{$src2, $src1|$src1, $src2}",
1585 [(set EFLAGS, (X86comi (v2f64 VR128:$src1),
1587 def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
1588 "comisd\t{$src2, $src1|$src1, $src2}",
1589 [(set EFLAGS, (X86comi (v2f64 VR128:$src1),
1590 (load addr:$src2)))]>;
1591 } // Defs = [EFLAGS]
1593 // Aliases of packed SSE2 instructions for scalar use. These all have names
1594 // that start with 'Fs'.
1596 // Alias instructions that map fld0 to pxor for sse.
// Materializes the f64 constant +0.0 (fpimm0) into an FR64 register. It has no
// inputs and an empty assembly string, and is marked isCodeGenOnly.
1597 let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
1598 canFoldAsLoad = 1 in
1599 def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
1600 [(set FR64:$dst, fpimm0)]>,
1601 Requires<[HasSSE2]>, TB, OpSize;
1603 // Alias instruction to do FR64 reg-to-reg copy using movapd. The upper bits
// of the register are not part of the FR64 value and are disregarded.
// No selection pattern is attached (empty pattern list).
1605 let neverHasSideEffects = 1 in
1606 def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
1607 "movapd\t{$src, $dst|$dst, $src}", []>;
1609 // Alias instruction to load FR64 from f128mem using movapd. The upper bits
// of the loaded value are disregarded. Selected for alignedloadfsf64 of
// addr:$src.
1611 let canFoldAsLoad = 1, isReMaterializable = 1 in
1612 def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
1613 "movapd\t{$src, $dst|$dst, $src}",
1614 [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
1616 //===---------------------------------------------------------------------===//
1617 // SSE packed FP Instructions
1619 // Move Instructions
1620 let neverHasSideEffects = 1 in
1621 def MOVAPDrr : PDI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1622 "movapd\t{$src, $dst|$dst, $src}", []>;
1623 let canFoldAsLoad = 1, isReMaterializable = 1 in
1624 def MOVAPDrm : PDI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1625 "movapd\t{$src, $dst|$dst, $src}",
1626 [(set VR128:$dst, (alignedloadv2f64 addr:$src))]>;
1628 def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
1629 "movapd\t{$src, $dst|$dst, $src}",
1630 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
1632 let neverHasSideEffects = 1 in
1633 def MOVUPDrr : PDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1634 "movupd\t{$src, $dst|$dst, $src}", []>;
1635 let canFoldAsLoad = 1 in
1636 def MOVUPDrm : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1637 "movupd\t{$src, $dst|$dst, $src}",
1638 [(set VR128:$dst, (loadv2f64 addr:$src))]>;
1639 def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
1640 "movupd\t{$src, $dst|$dst, $src}",
1641 [(store (v2f64 VR128:$src), addr:$dst)]>;
1643 // Intrinsic forms of MOVUPD load and store
1644 def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1645 "movupd\t{$src, $dst|$dst, $src}",
1646 [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>;
1647 def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
1648 "movupd\t{$src, $dst|$dst, $src}",
1649 [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;
1651 let Constraints = "$src1 = $dst" in {
1652 let AddedComplexity = 20 in {
// movlpd / movhpd loads: combine the tied register operand $src1 with one f64
// loaded from addr:$src2 (wrapped in scalar_to_vector), via the movlp and
// movlhps shuffle fragments respectively. Each pattern must open with
// "[(set VR128:$dst," so that it defines the tied result and its brackets
// balance.
1653 def MOVLPDrm : PDI<0x12, MRMSrcMem,
1654 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
1655 "movlpd\t{$src2, $dst|$dst, $src2}",
1656 [(set VR128:$dst,
1657 (v2f64 (movlp VR128:$src1,
1658 (scalar_to_vector (loadf64 addr:$src2)))))]>;
1659 def MOVHPDrm : PDI<0x16, MRMSrcMem,
1660 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
1661 "movhpd\t{$src2, $dst|$dst, $src2}",
1662 [(set VR128:$dst,
1663 (v2f64 (movlhps VR128:$src1,
1664 (scalar_to_vector (loadf64 addr:$src2)))))]>;
1665 } // AddedComplexity
1666 } // Constraints = "$src1 = $dst"
1668 def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
1669 "movlpd\t{$src, $dst|$dst, $src}",
1670 [(store (f64 (vector_extract (v2f64 VR128:$src),
1671 (iPTR 0))), addr:$dst)]>;
1673 // v2f64 extract element 1 is always custom lowered to unpack high to low
1674 // and extract element 0 so the non-store version isn't too horrible.
1675 def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
1676 "movhpd\t{$src, $dst|$dst, $src}",
1677 [(store (f64 (vector_extract
1678 (v2f64 (unpckh VR128:$src, (undef))),
1679 (iPTR 0))), addr:$dst)]>;
1681 // SSE2 instructions without OpSize prefix
1682 def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1683 "cvtdq2ps\t{$src, $dst|$dst, $src}",
1684 [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
1685 TB, Requires<[HasSSE2]>;
1686 def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
1687 "cvtdq2ps\t{$src, $dst|$dst, $src}",
1688 [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
1689 (bitconvert (memopv2i64 addr:$src))))]>,
1690 TB, Requires<[HasSSE2]>;
1692 // SSE2 instructions with XS prefix
// Intrinsic forms of cvtdq2pd, matching int_x86_sse2_cvtdq2pd on a VR128
// source (rr) or on a folded load (rm).
1693 def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1694 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1695 [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
1696 XS, Requires<[HasSSE2]>;
// NOTE(review): the memory operand is declared i64mem, but the pattern folds a
// memopv2i64 load -- confirm the intended load width.
1697 def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1698 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1699 [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
1700 (bitconvert (memopv2i64 addr:$src))))]>,
1701 XS, Requires<[HasSSE2]>;
1703 def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1704 "cvtps2dq\t{$src, $dst|$dst, $src}",
1705 [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
1706 def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1707 "cvtps2dq\t{$src, $dst|$dst, $src}",
1708 [(set VR128:$dst, (int_x86_sse2_cvtps2dq
1709 (memop addr:$src)))]>;
1710 // SSE2 packed instructions with XS prefix
1711 def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1712 "cvttps2dq\t{$src, $dst|$dst, $src}", []>;
1713 def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1714 "cvttps2dq\t{$src, $dst|$dst, $src}", []>;
1716 def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1717 "cvttps2dq\t{$src, $dst|$dst, $src}",
1719 (int_x86_sse2_cvttps2dq VR128:$src))]>,
1720 XS, Requires<[HasSSE2]>;
1721 def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1722 "cvttps2dq\t{$src, $dst|$dst, $src}",
1723 [(set VR128:$dst, (int_x86_sse2_cvttps2dq
1724 (memop addr:$src)))]>,
1725 XS, Requires<[HasSSE2]>;
1727 // SSE2 packed instructions with XD prefix
1728 def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1729 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1730 [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
1731 XD, Requires<[HasSSE2]>;
1732 def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1733 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1734 [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
1735 (memop addr:$src)))]>,
1736 XD, Requires<[HasSSE2]>;
1738 def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1739 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1740 [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
1741 def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1742 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1743 [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
1744 (memop addr:$src)))]>;
1746 // SSE2 instructions without OpSize prefix
1747 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1748 "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
1749 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1750 "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
1752 def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1753 "cvtps2pd\t{$src, $dst|$dst, $src}",
1754 [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
1755 TB, Requires<[HasSSE2]>;
1756 def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1757 "cvtps2pd\t{$src, $dst|$dst, $src}",
1758 [(set VR128:$dst, (int_x86_sse2_cvtps2pd
1759 (load addr:$src)))]>,
1760 TB, Requires<[HasSSE2]>;
1762 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1763 "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
1764 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1765 "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
1768 def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1769 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1770 [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
1771 def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1772 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1773 [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
1774 (memop addr:$src)))]>;
1776 // Match intrinsics which expect XMM operand(s).
1777 // Aliases for intrinsics
1778 let Constraints = "$src1 = $dst" in {
1779 def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg,
1780 (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
1781 "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
1782 [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
1784 def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem,
1785 (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
1786 "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
1787 [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
1788 (loadi32 addr:$src2)))]>;
1789 def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg,
1790 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1791 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1792 [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
1794 def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem,
1795 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
1796 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1797 [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
1798 (load addr:$src2)))]>;
1799 def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
1800 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1801 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1802 [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
1803 VR128:$src2))]>, XS,
1804 Requires<[HasSSE2]>;
1805 def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
1806 (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
1807 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1808 [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
1809 (load addr:$src2)))]>, XS,
1810 Requires<[HasSSE2]>;
1815 /// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms.
1817 /// In addition, we also have a special variant of the scalar form here to
1818 /// represent the associated intrinsic operation. This form is unlike the
1819 /// plain scalar form, in that it takes an entire vector (instead of a
1820 /// scalar) and leaves the top elements undefined.
1822 /// Finally, there is a variant for the full-vector intrinsic form.
1824 /// These four forms can each have a reg or a mem operand, so there are a
1825 /// total of eight "instructions".
1827 multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
1831 bit Commutable = 0> {
1832 // Scalar operation, reg.
1833 def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
1834 !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
1835 [(set FR64:$dst, (OpNode FR64:$src))]> {
1836 let isCommutable = Commutable;
1839 // Scalar operation, mem.
1840 def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
1841 !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
1842 [(set FR64:$dst, (OpNode (load addr:$src)))]>;
1844 // Vector operation, reg.
1845 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1846 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
1847 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> {
1848 let isCommutable = Commutable;
1851 // Vector operation, mem.
1852 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1853 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
1854 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>;
1856 // Intrinsic operation, reg.
1857 def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1858 !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
1859 [(set VR128:$dst, (F64Int VR128:$src))]> {
1860 let isCommutable = Commutable;
1863 // Intrinsic operation, mem.
1864 def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
1865 !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
1866 [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
1868 // Vector intrinsic operation, reg
1869 def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1870 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
1871 [(set VR128:$dst, (V2F64Int VR128:$src))]> {
1872 let isCommutable = Commutable;
1875 // Vector intrinsic operation, mem
1876 def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1877 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
1878 [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>;
1882 defm SQRT : sse2_fp_unop_rm<0x51, "sqrt", fsqrt,
1883 int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>;
1885 // There are no f64 versions of the reciprocal approximation instructions.
1887 let Constraints = "$src1 = $dst" in {
1888 def CMPPDrri : PDIi8<0xC2, MRMSrcReg,
1889 (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
1890 "cmp${cc}pd\t{$src, $dst|$dst, $src}",
1891 [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
1892 VR128:$src, imm:$cc))]>;
1893 def CMPPDrmi : PDIi8<0xC2, MRMSrcMem,
1894 (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
1895 "cmp${cc}pd\t{$src, $dst|$dst, $src}",
1896 [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
1897 (memop addr:$src), imm:$cc))]>;
1899 // Accept explicit immediate argument form instead of comparison code.
1900 let isAsmParserOnly = 1 in {
1901 def CMPPDrri_alt : PDIi8<0xC2, MRMSrcReg,
1902 (outs VR128:$dst), (ins VR128:$src1, VR128:$src, i8imm:$src2),
1903 "cmppd\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
1904 def CMPPDrmi_alt : PDIi8<0xC2, MRMSrcMem,
1905 (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, i8imm:$src2),
1906 "cmppd\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
1909 def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
1910 (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
1911 def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
1912 (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
1914 // Shuffle and unpack instructions
1915 let Constraints = "$src1 = $dst" in {
1916 def SHUFPDrri : PDIi8<0xC6, MRMSrcReg,
1917 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3),
1918 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
1920 (v2f64 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
1921 def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem,
1922 (outs VR128:$dst), (ins VR128:$src1,
1923 f128mem:$src2, i8imm:$src3),
1924 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
1927 VR128:$src1, (memopv2f64 addr:$src2))))]>;
1929 let AddedComplexity = 10 in {
1930 def UNPCKHPDrr : PDI<0x15, MRMSrcReg,
1931 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1932 "unpckhpd\t{$src2, $dst|$dst, $src2}",
1934 (v2f64 (unpckh VR128:$src1, VR128:$src2)))]>;
1935 def UNPCKHPDrm : PDI<0x15, MRMSrcMem,
1936 (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
1937 "unpckhpd\t{$src2, $dst|$dst, $src2}",
1939 (v2f64 (unpckh VR128:$src1,
1940 (memopv2f64 addr:$src2))))]>;
1942 def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
1943 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1944 "unpcklpd\t{$src2, $dst|$dst, $src2}",
1946 (v2f64 (unpckl VR128:$src1, VR128:$src2)))]>;
1947 def UNPCKLPDrm : PDI<0x14, MRMSrcMem,
1948 (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
1949 "unpcklpd\t{$src2, $dst|$dst, $src2}",
1951 (unpckl VR128:$src1, (memopv2f64 addr:$src2)))]>;
1952 } // AddedComplexity
1953 } // Constraints = "$src1 = $dst"
1956 //===---------------------------------------------------------------------===//
1957 // SSE integer instructions
1958 let ExeDomain = SSEPackedInt in {
1960 // Move Instructions
1961 let neverHasSideEffects = 1 in
1962 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1963 "movdqa\t{$src, $dst|$dst, $src}", []>;
1964 let canFoldAsLoad = 1, mayLoad = 1 in
1965 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
1966 "movdqa\t{$src, $dst|$dst, $src}",
1967 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
1969 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
1970 "movdqa\t{$src, $dst|$dst, $src}",
1971 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
1972 let canFoldAsLoad = 1, mayLoad = 1 in
1973 def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
1974 "movdqu\t{$src, $dst|$dst, $src}",
1975 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
1976 XS, Requires<[HasSSE2]>;
1978 def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
1979 "movdqu\t{$src, $dst|$dst, $src}",
1980 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
1981 XS, Requires<[HasSSE2]>;
1983 // Intrinsic forms of MOVDQU load and store
1984 let canFoldAsLoad = 1 in
1985 def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
1986 "movdqu\t{$src, $dst|$dst, $src}",
1987 [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
1988 XS, Requires<[HasSSE2]>;
1989 def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
1990 "movdqu\t{$src, $dst|$dst, $src}",
1991 [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
1992 XS, Requires<[HasSSE2]>;
1994 let Constraints = "$src1 = $dst" in {
1996 multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
1997 bit Commutable = 0> {
1998 def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
1999 (ins VR128:$src1, VR128:$src2),
2000 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2001 [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
2002 let isCommutable = Commutable;
2004 def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
2005 (ins VR128:$src1, i128mem:$src2),
2006 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2007 [(set VR128:$dst, (IntId VR128:$src1,
2008 (bitconvert (memopv2i64
2012 multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
2014 Intrinsic IntId, Intrinsic IntId2> {
2015 def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
2016 (ins VR128:$src1, VR128:$src2),
2017 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2018 [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
2019 def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
2020 (ins VR128:$src1, i128mem:$src2),
2021 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2022 [(set VR128:$dst, (IntId VR128:$src1,
2023 (bitconvert (memopv2i64 addr:$src2))))]>;
2024 def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst),
2025 (ins VR128:$src1, i32i8imm:$src2),
2026 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2027 [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>;
2030 /// PDI_binop_rm - Simple SSE2 binary operator.
2031 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2032 ValueType OpVT, bit Commutable = 0> {
2033 def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
2034 (ins VR128:$src1, VR128:$src2),
2035 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2036 [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> {
2037 let isCommutable = Commutable;
2039 def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
2040 (ins VR128:$src1, i128mem:$src2),
2041 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2042 [(set VR128:$dst, (OpVT (OpNode VR128:$src1,
2043 (bitconvert (memopv2i64 addr:$src2)))))]>;
2046 /// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
2048 /// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew
2049 /// to collapse (bitconvert VT to VT) into its operand.
2051 multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
2052 bit Commutable = 0> {
2053 def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
2054 (ins VR128:$src1, VR128:$src2),
2055 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2056 [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> {
2057 let isCommutable = Commutable;
2059 def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
2060 (ins VR128:$src1, i128mem:$src2),
2061 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2062 [(set VR128:$dst, (OpNode VR128:$src1,
2063 (memopv2i64 addr:$src2)))]>;
2066 } // Constraints = "$src1 = $dst"
2067 } // ExeDomain = SSEPackedInt
2069 // 128-bit Integer Arithmetic
// Each defm below expands to an rr and an rm instruction. PDI_binop_rm and
// PDI_binop_rm_v2i64 match a generic SDNode (add, sub, mul); PDI_binop_rm_int
// matches the named intrinsic. A trailing "1" sets the multiclass's Commutable
// bit (default 0), which becomes isCommutable on the rr form.
2071 defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>;
2072 defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>;
2073 defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>;
2074 defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>;
2076 defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>;
2077 defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>;
2078 defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>;
2079 defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>;
// Subtractions are left non-commutable (no trailing Commutable argument).
2081 defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>;
2082 defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>;
2083 defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>;
2084 defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>;
2086 defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>;
2087 defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>;
2088 defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>;
2089 defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>;
2091 defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>;
2093 defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>;
2094 defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>;
2095 defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>;
2097 defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>;
2099 defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>;
2100 defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>;
2103 defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>;
2104 defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>;
2105 defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>;
2106 defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
2107 defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;
// Vector shifts. Per PDI_binop_rmi_int, the arguments are, in order: the
// opcode of the rr/rm forms (shift count in a VR128 register or i128mem), the
// opcode and instruction Format of the ri form (immediate count), the
// mnemonic, the intrinsic matched by the rr/rm forms, and the intrinsic
// matched by the ri form.
// Left shifts: immediate form uses MRM6r.
2110 defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
2111 int_x86_sse2_psll_w, int_x86_sse2_pslli_w>;
2112 defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
2113 int_x86_sse2_psll_d, int_x86_sse2_pslli_d>;
2114 defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
2115 int_x86_sse2_psll_q, int_x86_sse2_pslli_q>;
// Logical right shifts: immediate form uses MRM2r.
2117 defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
2118 int_x86_sse2_psrl_w, int_x86_sse2_psrli_w>;
2119 defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
2120 int_x86_sse2_psrl_d, int_x86_sse2_psrli_d>;
2121 defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
2122 int_x86_sse2_psrl_q, int_x86_sse2_psrli_q>;
// Arithmetic right shifts: immediate form uses MRM4r. Only the word and
// doubleword variants are defined here.
2124 defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
2125 int_x86_sse2_psra_w, int_x86_sse2_psrai_w>;
2126 defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
2127 int_x86_sse2_psra_d, int_x86_sse2_psrai_d>;
2129 // 128-bit logical shifts.
2130 let Constraints = "$src1 = $dst", neverHasSideEffects = 1,
2131 ExeDomain = SSEPackedInt in {
2132 def PSLLDQri : PDIi8<0x73, MRM7r,
2133 (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
2134 "pslldq\t{$src2, $dst|$dst, $src2}", []>;
2135 def PSRLDQri : PDIi8<0x73, MRM3r,
2136 (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
2137 "psrldq\t{$src2, $dst|$dst, $src2}", []>;
2138 // PSRADQri doesn't exist in SSE[1-3].
2141 let Predicates = [HasSSE2] in {
2142 def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
2143 (v2i64 (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
2144 def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
2145 (v2i64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
2146 def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2),
2147 (v2i64 (PSLLDQri VR128:$src1, imm:$src2))>;
2148 def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2),
2149 (v2i64 (PSRLDQri VR128:$src1, imm:$src2))>;
2150 def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
2151 (v2f64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
2153 // Shift up / down and insert zeros.
2154 def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))),
2155 (v2i64 (PSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>;
2156 def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))),
2157 (v2i64 (PSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>;
2161 defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>;
2162 defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>;
2163 defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>;
2165 let Constraints = "$src1 = $dst", ExeDomain = SSEPackedInt in {
2166 def PANDNrr : PDI<0xDF, MRMSrcReg,
2167 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2168 "pandn\t{$src2, $dst|$dst, $src2}",
2169 [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
2172 def PANDNrm : PDI<0xDF, MRMSrcMem,
2173 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
2174 "pandn\t{$src2, $dst|$dst, $src2}",
2175 [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
2176 (memopv2i64 addr:$src2))))]>;
2179 // SSE2 Integer comparison
2180 defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>;
2181 defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>;
2182 defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>;
2183 defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>;
2184 defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
2185 defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
2187 def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
2188 (PCMPEQBrr VR128:$src1, VR128:$src2)>;
2189 def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
2190 (PCMPEQBrm VR128:$src1, addr:$src2)>;
2191 def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
2192 (PCMPEQWrr VR128:$src1, VR128:$src2)>;
2193 def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
2194 (PCMPEQWrm VR128:$src1, addr:$src2)>;
2195 def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
2196 (PCMPEQDrr VR128:$src1, VR128:$src2)>;
2197 def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
2198 (PCMPEQDrm VR128:$src1, addr:$src2)>;
2200 def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
2201 (PCMPGTBrr VR128:$src1, VR128:$src2)>;
2202 def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
2203 (PCMPGTBrm VR128:$src1, addr:$src2)>;
2204 def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
2205 (PCMPGTWrr VR128:$src1, VR128:$src2)>;
2206 def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
2207 (PCMPGTWrm VR128:$src1, addr:$src2)>;
2208 def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
2209 (PCMPGTDrr VR128:$src1, VR128:$src2)>;
2210 def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
2211 (PCMPGTDrm VR128:$src1, addr:$src2)>;
2214 // Pack instructions
2215 defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
2216 defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
2217 defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;
2219 let ExeDomain = SSEPackedInt in {
2221 // Shuffle and unpack instructions
2222 let AddedComplexity = 5 in {
2223 def PSHUFDri : PDIi8<0x70, MRMSrcReg,
2224 (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
2225 "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2226 [(set VR128:$dst, (v4i32 (pshufd:$src2
2227 VR128:$src1, (undef))))]>;
2228 def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
2229 (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
2230 "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2231 [(set VR128:$dst, (v4i32 (pshufd:$src2
2232 (bc_v4i32 (memopv2i64 addr:$src1)),
2236 // SSE2 with ImmT == Imm8 and XS prefix.
2237 def PSHUFHWri : Ii8<0x70, MRMSrcReg,
2238 (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
2239 "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2240 [(set VR128:$dst, (v8i16 (pshufhw:$src2 VR128:$src1,
2242 XS, Requires<[HasSSE2]>;
2243 def PSHUFHWmi : Ii8<0x70, MRMSrcMem,
2244 (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
2245 "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2246 [(set VR128:$dst, (v8i16 (pshufhw:$src2
2247 (bc_v8i16 (memopv2i64 addr:$src1)),
2249 XS, Requires<[HasSSE2]>;
2251 // SSE2 with ImmT == Imm8 and XD prefix.
2252 def PSHUFLWri : Ii8<0x70, MRMSrcReg,
2253 (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
2254 "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2255 [(set VR128:$dst, (v8i16 (pshuflw:$src2 VR128:$src1,
2257 XD, Requires<[HasSSE2]>;
2258 def PSHUFLWmi : Ii8<0x70, MRMSrcMem,
2259 (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
2260 "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2261 [(set VR128:$dst, (v8i16 (pshuflw:$src2
2262 (bc_v8i16 (memopv2i64 addr:$src1)),
2264 XD, Requires<[HasSSE2]>;
2266 // Unpack instructions
2267 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
2268 PatFrag unp_frag, PatFrag bc_frag> {
2269 def rr : PDI<opc, MRMSrcReg,
2270 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2271 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
2272 [(set VR128:$dst, (vt (unp_frag VR128:$src1, VR128:$src2)))]>;
2273 def rm : PDI<opc, MRMSrcMem,
2274 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
2275 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
2276 [(set VR128:$dst, (unp_frag VR128:$src1,
2277 (bc_frag (memopv2i64
2281 let Constraints = "$src1 = $dst" in {
2282 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, unpckl, bc_v16i8>;
2283 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, unpckl, bc_v8i16>;
2284 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, unpckl, bc_v4i32>;
2286 /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
2287 /// knew to collapse (bitconvert VT to VT) into its operand.
2288 def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
2289 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2290 "punpcklqdq\t{$src2, $dst|$dst, $src2}",
2292 (v2i64 (unpckl VR128:$src1, VR128:$src2)))]>;
2293 def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
2294 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
2295 "punpcklqdq\t{$src2, $dst|$dst, $src2}",
2297 (v2i64 (unpckl VR128:$src1,
2298 (memopv2i64 addr:$src2))))]>;
2300 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, unpckh, bc_v16i8>;
2301 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, unpckh, bc_v8i16>;
2302 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, unpckh, bc_v4i32>;
2304 /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
2305 /// knew to collapse (bitconvert VT to VT) into its operand.
2306 def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
2307 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2308 "punpckhqdq\t{$src2, $dst|$dst, $src2}",
2310 (v2i64 (unpckh VR128:$src1, VR128:$src2)))]>;
2311 def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
2312 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
2313 "punpckhqdq\t{$src2, $dst|$dst, $src2}",
2315 (v2i64 (unpckh VR128:$src1,
2316 (memopv2i64 addr:$src2))))]>;
2320 def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
2321 (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
2322 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2323 [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
2325 let Constraints = "$src1 = $dst" in {
2326 def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
2327 (outs VR128:$dst), (ins VR128:$src1,
2328 GR32:$src2, i32i8imm:$src3),
2329 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2331 (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
2332 def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
2333 (outs VR128:$dst), (ins VR128:$src1,
2334 i16mem:$src2, i32i8imm:$src3),
2335 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2337 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
2342 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
2343 "pmovmskb\t{$src, $dst|$dst, $src}",
2344 [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
2346 // Conditional store
// maskmovdqu has no explicit address operand: the patterns below pass EDI
// (32-bit) or RDI (64-bit) as the destination pointer of
// int_x86_sse2_maskmov_dqu. That register is therefore an implicit input and
// must be listed in Uses.
2347 let Uses = [EDI] in
2348 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
2349 "maskmovdqu\t{$mask, $src|$src, $mask}",
2350 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
2352 let Uses = [RDI] in
2353 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
2354 "maskmovdqu\t{$mask, $src|$src, $mask}",
2355 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
2357 } // ExeDomain = SSEPackedInt
2359 // Non-temporal stores
// The *_Int records below are selected only for the matching
// int_x86_sse2_movnt_* intrinsic; each stores $src to the address in $dst.
2360 def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
2361 "movntpd\t{$src, $dst|$dst, $src}",
2362 [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
// movntdq is the only one of the three placed in the SSEPackedInt domain.
2363 let ExeDomain = SSEPackedInt in
2364 def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
2365 "movntdq\t{$src, $dst|$dst, $src}",
2366 [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
// movnti stores a GR32 rather than a VR128, so it is built from the plain I
// class with an explicit TB prefix and an explicit HasSSE2 requirement.
2367 def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
2368 "movnti\t{$src, $dst|$dst, $src}",
2369 [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
2370 TB, Requires<[HasSSE2]>;
2372 let AddedComplexity = 400 in { // Prefer non-temporal versions
2373 def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
2374 "movntpd\t{$src, $dst|$dst, $src}",
2375 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
2377 let ExeDomain = SSEPackedInt in
2378 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
2379 "movntdq\t{$src, $dst|$dst, $src}",
2380 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
// CLFLUSH - selected for int_x86_sse2_clflush on the memory operand $src.
// Built from the plain I class, so the TB prefix and the HasSSE2 requirement
// are attached explicitly.
2384 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
2385 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
2386 TB, Requires<[HasSSE2]>;
2388 // Load, store, and memory fence
// LFENCE and MFENCE take no operands and are selected for their respective
// int_x86_sse2_* fence intrinsics. Both share opcode 0xAE and differ only in
// the MRM_xx form; both require HasSSE2.
2389 def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
2390 "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
2391 def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
2392 "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
2394 // Pause. This "instruction" is encoded as "rep; nop", so even though it
2395 // was introduced with SSE2, it's backward compatible.
// Consequently there is no Requires<> predicate here. The pattern list is
// empty, so this record is never chosen by instruction selection patterns.
2396 def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
2398 //TODO: custom lower this so as to never even generate the noop
2399 def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm),
2401 def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
2402 def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
2403 def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm),
2406 // Alias instruction that maps an all-ones vector to pcmpeqd (opcode 0x76).
2407 // We set canFoldAsLoad because this can be converted to a constant-pool
2408 // load of an all-ones value if folding it would be beneficial.
// The asm string is empty and isCodeGenOnly is set: the record exists only so
// that (v4i32 immAllOnesV) can be selected into a VR128 with no inputs.
2409 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
2410 isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
2411 // FIXME: Change encoding to pseudo.
2412 def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
2413 [(set VR128:$dst, (v4i32 immAllOnesV))]>;
2415 def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
2416 "movd\t{$src, $dst|$dst, $src}",
2418 (v4i32 (scalar_to_vector GR32:$src)))]>;
2419 def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
2420 "movd\t{$src, $dst|$dst, $src}",
2422 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
// MOVDI2SSrr / MOVDI2SSrm - movd into an FR32. Selected for a bitconvert of a
// GR32 value (rr) or of an i32 load (rm) to the FR32 register class.
2424 def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
2425 "movd\t{$src, $dst|$dst, $src}",
2426 [(set FR32:$dst, (bitconvert GR32:$src))]>;
2428 def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
2429 "movd\t{$src, $dst|$dst, $src}",
2430 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
2432 // SSE2 instructions with XS prefix
2433 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
2434 "movq\t{$src, $dst|$dst, $src}",
2436 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
2437 Requires<[HasSSE2]>;
// MOVPQI2QImr - movq store: selected for storing element 0 of a v2i64 as an
// i64 to memory.
2438 def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
2439 "movq\t{$src, $dst|$dst, $src}",
2440 [(store (i64 (vector_extract (v2i64 VR128:$src),
2441 (iPTR 0))), addr:$dst)]>;
// Extracting element 0 of a v2f64 emits no instruction of its own: it is
// rewritten to an EXTRACT_SUBREG of the sub_sd subregister.
2443 def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
2444 (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
2446 def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
2447 "movd\t{$src, $dst|$dst, $src}",
2448 [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
2450 def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
2451 "movd\t{$src, $dst|$dst, $src}",
2452 [(store (i32 (vector_extract (v4i32 VR128:$src),
2453 (iPTR 0))), addr:$dst)]>;
// MOVSS2DIrr / MOVSS2DImr - movd out of an FR32. Selected for a bitconvert of
// an FR32 value to i32, either into a GR32 (rr) or stored to memory (mr).
2455 def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
2456 "movd\t{$src, $dst|$dst, $src}",
2457 [(set GR32:$dst, (bitconvert FR32:$src))]>;
2458 def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
2459 "movd\t{$src, $dst|$dst, $src}",
2460 [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
2462 // Store / copy the lower 64 bits of an XMM register.
// MOVLQ128mr is the store form, selected for the int_x86_sse2_storel_dq
// intrinsic.
2463 def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
2464 "movq\t{$src, $dst|$dst, $src}",
2465 [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
2467 // movd / movq to XMM register zero-extends
2468 let AddedComplexity = 15 in {
2469 def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
2470 "movd\t{$src, $dst|$dst, $src}",
2471 [(set VR128:$dst, (v4i32 (X86vzmovl
2472 (v4i32 (scalar_to_vector GR32:$src)))))]>;
2473 // This is X86-64 only.
2474 def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
2475 "mov{d|q}\t{$src, $dst|$dst, $src}",
2476 [(set VR128:$dst, (v2i64 (X86vzmovl
2477 (v2i64 (scalar_to_vector GR64:$src)))))]>;
2480 let AddedComplexity = 20 in {
2481 def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
2482 "movd\t{$src, $dst|$dst, $src}",
2484 (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
2485 (loadi32 addr:$src))))))]>;
2487 def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
2488 (MOVZDI2PDIrm addr:$src)>;
2489 def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
2490 (MOVZDI2PDIrm addr:$src)>;
2491 def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
2492 (MOVZDI2PDIrm addr:$src)>;
2494 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
2495 "movq\t{$src, $dst|$dst, $src}",
2497 (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
2498 (loadi64 addr:$src))))))]>, XS,
2499 Requires<[HasSSE2]>;
2501 def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
2502 (MOVZQI2PQIrm addr:$src)>;
2503 def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
2504 (MOVZQI2PQIrm addr:$src)>;
2505 def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
2508 // Move from XMM to XMM and clear the upper 64 bits. Note: there is a bug in
2509 // the IA32 documentation; movq xmm1, xmm2 does clear the high bits.
// Selected for X86vzmovl of a v2i64 register value. Built from the plain I
// class, so the XS prefix and HasSSE2 requirement are attached explicitly.
2510 let AddedComplexity = 15 in
2511 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2512 "movq\t{$src, $dst|$dst, $src}",
2513 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
2514 XS, Requires<[HasSSE2]>;
2516 let AddedComplexity = 20 in {
2517 def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
2518 "movq\t{$src, $dst|$dst, $src}",
2519 [(set VR128:$dst, (v2i64 (X86vzmovl
2520 (loadv2i64 addr:$src))))]>,
2521 XS, Requires<[HasSSE2]>;
2523 def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
2524 (MOVZPQILo2PQIrm addr:$src)>;
2527 // Instructions for the disassembler
2528 // xr = XMM register
2531 def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2532 "movq\t{$src, $dst|$dst, $src}", []>, XS;
2534 //===---------------------------------------------------------------------===//
2535 // SSE3 Instructions
2536 //===---------------------------------------------------------------------===//
2538 // Move Instructions
// movshdup, register and memory forms: selected for the movshdup shuffle
// fragment whose second operand is undef. The memory form folds a v4f32
// memop as the first operand.
2539 def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2540 "movshdup\t{$src, $dst|$dst, $src}",
2541 [(set VR128:$dst, (v4f32 (movshdup
2542 VR128:$src, (undef))))]>;
2543 def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2544 "movshdup\t{$src, $dst|$dst, $src}",
2545 [(set VR128:$dst, (movshdup
2546 (memopv4f32 addr:$src), (undef)))]>;
// movsldup, register and memory forms: same structure as movshdup above, but
// with opcode 0x12 and the movsldup shuffle fragment.
2548 def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2549 "movsldup\t{$src, $dst|$dst, $src}",
2550 [(set VR128:$dst, (v4f32 (movsldup
2551 VR128:$src, (undef))))]>;
2552 def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2553 "movsldup\t{$src, $dst|$dst, $src}",
2554 [(set VR128:$dst, (movsldup
2555 (memopv4f32 addr:$src), (undef)))]>;
2557 def MOVDDUPrr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2558 "movddup\t{$src, $dst|$dst, $src}",
2559 [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>;
2560 def MOVDDUPrm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
2561 "movddup\t{$src, $dst|$dst, $src}",
2563 (v2f64 (movddup (scalar_to_vector (loadf64 addr:$src)),
2566 def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
2568 (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
2570 let AddedComplexity = 5 in {
2571 def : Pat<(movddup (memopv2f64 addr:$src), (undef)),
2572 (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
2573 def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
2574 (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
2575 def : Pat<(movddup (memopv2i64 addr:$src), (undef)),
2576 (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
2577 def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
2578 (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
2582 let Constraints = "$src1 = $dst" in {
2583 def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg,
2584 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2585 "addsubps\t{$src2, $dst|$dst, $src2}",
2586 [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
2588 def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem,
2589 (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
2590 "addsubps\t{$src2, $dst|$dst, $src2}",
2591 [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
2592 (memop addr:$src2)))]>;
2593 def ADDSUBPDrr : S3I<0xD0, MRMSrcReg,
2594 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2595 "addsubpd\t{$src2, $dst|$dst, $src2}",
2596 [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
2598 def ADDSUBPDrm : S3I<0xD0, MRMSrcMem,
2599 (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
2600 "addsubpd\t{$src2, $dst|$dst, $src2}",
2601 [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
2602 (memop addr:$src2)))]>;
2605 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
2606 "lddqu\t{$src, $dst|$dst, $src}",
2607 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
// Helper classes for two-source SSE3 intrinsic instructions. Each takes the
// opcode byte `o`, the mnemonic `OpcodeStr` and the intrinsic `IntId` to
// select, and appends the "$src2, $dst" operand string to the mnemonic.
// $src1 is an input but is not printed.
//   S3D_Intrr / S3D_Intrm - derived from S3DI, result typed v4f32.
//   S3_Intrr  / S3_Intrm  - derived from S3I,  result typed v2f64.
// The *rr variants take $src2 in a VR128; the *rm variants fold a memory
// operand (f128mem) as $src2.
2610 class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
2611 : S3DI<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2612 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2613 [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>;
// Memory form of S3D_Intrr; the folded load uses the untyped `memop` fragment.
2614 class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
2615 : S3DI<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
2616 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2617 [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (memop addr:$src2))))]>;
2618 class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
2619 : S3I<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2620 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2621 [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>;
// Memory form of S3_Intrr; unlike S3D_Intrm it uses the typed `memopv2f64`
// fragment for the folded load.
2622 class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
2623 : S3I<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
2624 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2625 [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (memopv2f64 addr:$src2))))]>;
2627 let Constraints = "$src1 = $dst" in {
2628 def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>;
2629 def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>;
2630 def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
2631 def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
2632 def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
2633 def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
2634 def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
2635 def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
2638 // Thread synchronization
// Neither instruction has explicit operands. The patterns match the
// intrinsics only with the fixed physical registers shown as arguments
// (EAX, ECX, EDX for monitor; ECX, EAX for mwait). Both require HasSSE3.
2639 def MONITOR : I<0x01, MRM_C8, (outs), (ins), "monitor",
2640 [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>;
2641 def MWAIT : I<0x01, MRM_C9, (outs), (ins), "mwait",
2642 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
// Integer (v4i32) forms of the movshdup / movsldup shuffles are mapped onto
// the MOVSHDUP / MOVSLDUP instructions. The load-folding patterns match a
// v2i64 memop bitcast to v4i32 and carry a higher AddedComplexity (20) than
// the register patterns (15).
2644 // vector_shuffle v1, <undef> <1, 1, 3, 3>
2645 let AddedComplexity = 15 in
2646 def : Pat<(v4i32 (movshdup VR128:$src, (undef))),
2647 (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
2648 let AddedComplexity = 20 in
2649 def : Pat<(v4i32 (movshdup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
2650 (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;
2652 // vector_shuffle v1, <undef> <0, 0, 2, 2>
2653 let AddedComplexity = 15 in
2654 def : Pat<(v4i32 (movsldup VR128:$src, (undef))),
2655 (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
2656 let AddedComplexity = 20 in
2657 def : Pat<(v4i32 (movsldup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
2658 (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
2660 //===---------------------------------------------------------------------===//
2661 // SSSE3 Instructions
2662 //===---------------------------------------------------------------------===//
2664 /// SS3I_unop_rm_int_8 - Simple SSSE3 unary operator whose type is v*i8.
2665 multiclass SS3I_unop_rm_int_8<bits<8> opc, string OpcodeStr,
2666 Intrinsic IntId64, Intrinsic IntId128> {
2667 def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
2668 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2669 [(set VR64:$dst, (IntId64 VR64:$src))]>;
2671 def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
2672 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2674 (IntId64 (bitconvert (memopv8i8 addr:$src))))]>;
2676 def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
2678 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2679 [(set VR128:$dst, (IntId128 VR128:$src))]>,
2682 def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
2684 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2687 (bitconvert (memopv16i8 addr:$src))))]>, OpSize;
2690 /// SS3I_unop_rm_int_16 - Simple SSSE3 unary operator whose type is v*i16.
2691 multiclass SS3I_unop_rm_int_16<bits<8> opc, string OpcodeStr,
2692 Intrinsic IntId64, Intrinsic IntId128> {
2693 def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
2695 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2696 [(set VR64:$dst, (IntId64 VR64:$src))]>;
2698 def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
2700 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2703 (bitconvert (memopv4i16 addr:$src))))]>;
2705 def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
2707 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2708 [(set VR128:$dst, (IntId128 VR128:$src))]>,
2711 def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
2713 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2716 (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
2719 /// SS3I_unop_rm_int_32 - Simple SSSE3 unary operator whose type is v*i32.
2720 multiclass SS3I_unop_rm_int_32<bits<8> opc, string OpcodeStr,
2721 Intrinsic IntId64, Intrinsic IntId128> {
2722 def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
2724 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2725 [(set VR64:$dst, (IntId64 VR64:$src))]>;
2727 def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
2729 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2732 (bitconvert (memopv2i32 addr:$src))))]>;
2734 def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
2736 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2737 [(set VR128:$dst, (IntId128 VR128:$src))]>,
2740 def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
2742 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2745 (bitconvert (memopv4i32 addr:$src))))]>, OpSize;
// pabsb / pabsw / pabsd: each instantiates the unary SSSE3 multiclass for its
// element width (8, 16, 32), passing the 64-bit intrinsic first and the
// 128-bit (_128) intrinsic second.
2748 defm PABSB : SS3I_unop_rm_int_8 <0x1C, "pabsb",
2749 int_x86_ssse3_pabs_b,
2750 int_x86_ssse3_pabs_b_128>;
2751 defm PABSW : SS3I_unop_rm_int_16<0x1D, "pabsw",
2752 int_x86_ssse3_pabs_w,
2753 int_x86_ssse3_pabs_w_128>;
2754 defm PABSD : SS3I_unop_rm_int_32<0x1E, "pabsd",
2755 int_x86_ssse3_pabs_d,
2756 int_x86_ssse3_pabs_d_128>;
2758 /// SS3I_binop_rm_int_8 - Simple SSSE3 binary operator whose type is v*i8.
2759 let Constraints = "$src1 = $dst" in {
2760 multiclass SS3I_binop_rm_int_8<bits<8> opc, string OpcodeStr,
2761 Intrinsic IntId64, Intrinsic IntId128,
2762 bit Commutable = 0> {
2763 def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
2764 (ins VR64:$src1, VR64:$src2),
2765 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2766 [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
2767 let isCommutable = Commutable;
2769 def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
2770 (ins VR64:$src1, i64mem:$src2),
2771 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2773 (IntId64 VR64:$src1,
2774 (bitconvert (memopv8i8 addr:$src2))))]>;
2776 def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
2777 (ins VR128:$src1, VR128:$src2),
2778 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2779 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
2781 let isCommutable = Commutable;
2783 def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
2784 (ins VR128:$src1, i128mem:$src2),
2785 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2787 (IntId128 VR128:$src1,
2788 (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
2792 /// SS3I_binop_rm_int_16 - Simple SSSE3 binary operator whose type is v*i16.
2793 let Constraints = "$src1 = $dst" in {
2794 multiclass SS3I_binop_rm_int_16<bits<8> opc, string OpcodeStr,
2795 Intrinsic IntId64, Intrinsic IntId128,
2796 bit Commutable = 0> {
2797 def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
2798 (ins VR64:$src1, VR64:$src2),
2799 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2800 [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
2801 let isCommutable = Commutable;
2803 def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
2804 (ins VR64:$src1, i64mem:$src2),
2805 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2807 (IntId64 VR64:$src1,
2808 (bitconvert (memopv4i16 addr:$src2))))]>;
2810 def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
2811 (ins VR128:$src1, VR128:$src2),
2812 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2813 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
2815 let isCommutable = Commutable;
2817 def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
2818 (ins VR128:$src1, i128mem:$src2),
2819 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2821 (IntId128 VR128:$src1,
2822 (bitconvert (memopv8i16 addr:$src2))))]>, OpSize;
2826 /// SS3I_binop_rm_int_32 - Simple SSSE3 binary operator whose type is v*i32.
2827 let Constraints = "$src1 = $dst" in {
2828 multiclass SS3I_binop_rm_int_32<bits<8> opc, string OpcodeStr,
2829 Intrinsic IntId64, Intrinsic IntId128,
2830 bit Commutable = 0> {
2831 def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
2832 (ins VR64:$src1, VR64:$src2),
2833 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2834 [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
2835 let isCommutable = Commutable;
2837 def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
2838 (ins VR64:$src1, i64mem:$src2),
2839 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2841 (IntId64 VR64:$src1,
2842 (bitconvert (memopv2i32 addr:$src2))))]>;
2844 def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
2845 (ins VR128:$src1, VR128:$src2),
2846 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2847 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
2849 let isCommutable = Commutable;
2851 def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
2852 (ins VR128:$src1, i128mem:$src2),
2853 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2855 (IntId128 VR128:$src1,
2856 (bitconvert (memopv4i32 addr:$src2))))]>, OpSize;
2860 let ImmT = NoImm in { // None of these have i8 immediate fields.
2861 defm PHADDW : SS3I_binop_rm_int_16<0x01, "phaddw",
2862 int_x86_ssse3_phadd_w,
2863 int_x86_ssse3_phadd_w_128>;
2864 defm PHADDD : SS3I_binop_rm_int_32<0x02, "phaddd",
2865 int_x86_ssse3_phadd_d,
2866 int_x86_ssse3_phadd_d_128>;
2867 defm PHADDSW : SS3I_binop_rm_int_16<0x03, "phaddsw",
2868 int_x86_ssse3_phadd_sw,
2869 int_x86_ssse3_phadd_sw_128>;
2870 defm PHSUBW : SS3I_binop_rm_int_16<0x05, "phsubw",
2871 int_x86_ssse3_phsub_w,
2872 int_x86_ssse3_phsub_w_128>;
2873 defm PHSUBD : SS3I_binop_rm_int_32<0x06, "phsubd",
2874 int_x86_ssse3_phsub_d,
2875 int_x86_ssse3_phsub_d_128>;
2876 defm PHSUBSW : SS3I_binop_rm_int_16<0x07, "phsubsw",
2877 int_x86_ssse3_phsub_sw,
2878 int_x86_ssse3_phsub_sw_128>;
2879 defm PMADDUBSW : SS3I_binop_rm_int_8 <0x04, "pmaddubsw",
2880 int_x86_ssse3_pmadd_ub_sw,
2881 int_x86_ssse3_pmadd_ub_sw_128>;
2882 defm PMULHRSW : SS3I_binop_rm_int_16<0x0B, "pmulhrsw",
2883 int_x86_ssse3_pmul_hr_sw,
2884 int_x86_ssse3_pmul_hr_sw_128, 1>;
2886 defm PSHUFB : SS3I_binop_rm_int_8 <0x00, "pshufb",
2887 int_x86_ssse3_pshuf_b,
2888 int_x86_ssse3_pshuf_b_128>;
2889 defm PSIGNB : SS3I_binop_rm_int_8 <0x08, "psignb",
2890 int_x86_ssse3_psign_b,
2891 int_x86_ssse3_psign_b_128>;
2892 defm PSIGNW : SS3I_binop_rm_int_16<0x09, "psignw",
2893 int_x86_ssse3_psign_w,
2894 int_x86_ssse3_psign_w_128>;
2895 defm PSIGND : SS3I_binop_rm_int_32<0x0A, "psignd",
2896 int_x86_ssse3_psign_d,
2897 int_x86_ssse3_psign_d_128>;
2900 // palignr patterns.
2901 let Constraints = "$src1 = $dst" in {
2902 def PALIGNR64rr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
2903 (ins VR64:$src1, VR64:$src2, i8imm:$src3),
2904 "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2906 def PALIGNR64rm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
2907 (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
2908 "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2911 def PALIGNR128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
2912 (ins VR128:$src1, VR128:$src2, i8imm:$src3),
2913 "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2915 def PALIGNR128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
2916 (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
2917 "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2921 let AddedComplexity = 5 in {
2923 def : Pat<(v1i64 (palign:$src3 VR64:$src1, VR64:$src2)),
2924 (PALIGNR64rr VR64:$src2, VR64:$src1,
2925 (SHUFFLE_get_palign_imm VR64:$src3))>,
2926 Requires<[HasSSSE3]>;
2927 def : Pat<(v2i32 (palign:$src3 VR64:$src1, VR64:$src2)),
2928 (PALIGNR64rr VR64:$src2, VR64:$src1,
2929 (SHUFFLE_get_palign_imm VR64:$src3))>,
2930 Requires<[HasSSSE3]>;
2931 def : Pat<(v2f32 (palign:$src3 VR64:$src1, VR64:$src2)),
2932 (PALIGNR64rr VR64:$src2, VR64:$src1,
2933 (SHUFFLE_get_palign_imm VR64:$src3))>,
2934 Requires<[HasSSSE3]>;
2935 def : Pat<(v4i16 (palign:$src3 VR64:$src1, VR64:$src2)),
2936 (PALIGNR64rr VR64:$src2, VR64:$src1,
2937 (SHUFFLE_get_palign_imm VR64:$src3))>,
2938 Requires<[HasSSSE3]>;
2939 def : Pat<(v8i8 (palign:$src3 VR64:$src1, VR64:$src2)),
2940 (PALIGNR64rr VR64:$src2, VR64:$src1,
2941 (SHUFFLE_get_palign_imm VR64:$src3))>,
2942 Requires<[HasSSSE3]>;
2944 def : Pat<(v4i32 (palign:$src3 VR128:$src1, VR128:$src2)),
2945 (PALIGNR128rr VR128:$src2, VR128:$src1,
2946 (SHUFFLE_get_palign_imm VR128:$src3))>,
2947 Requires<[HasSSSE3]>;
2948 def : Pat<(v4f32 (palign:$src3 VR128:$src1, VR128:$src2)),
2949 (PALIGNR128rr VR128:$src2, VR128:$src1,
2950 (SHUFFLE_get_palign_imm VR128:$src3))>,
2951 Requires<[HasSSSE3]>;
2952 def : Pat<(v8i16 (palign:$src3 VR128:$src1, VR128:$src2)),
2953 (PALIGNR128rr VR128:$src2, VR128:$src1,
2954 (SHUFFLE_get_palign_imm VR128:$src3))>,
2955 Requires<[HasSSSE3]>;
2956 def : Pat<(v16i8 (palign:$src3 VR128:$src1, VR128:$src2)),
2957 (PALIGNR128rr VR128:$src2, VR128:$src1,
2958 (SHUFFLE_get_palign_imm VR128:$src3))>,
2959 Requires<[HasSSSE3]>;
// Select the X86pshufb node onto the 128-bit PSHUFB records. The second
// pattern folds the mask from memory when it is a v2i64 memop bitcast to
// v16i8. Both require HasSSSE3.
2962 def : Pat<(X86pshufb VR128:$src, VR128:$mask),
2963 (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>;
2964 def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
2965 (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
2967 //===---------------------------------------------------------------------===//
2968 // Non-Instruction Patterns
2969 //===---------------------------------------------------------------------===//
2971 // extload f32 -> f64. This matches load+fextend because we have a hack in
2972 // the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
2974 // Since these loads aren't folded into the fextend, we have to match it
// explicitly here.
// The pattern selects fextend of an f32 load to CVTSS2SDrm when SSE2 is
// available.
2976 let Predicates = [HasSSE2] in
2977 def : Pat<(fextend (loadf32 addr:$src)),
2978 (CVTSS2SDrm addr:$src)>;
2981 let Predicates = [HasSSE2] in {
2982 def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
2983 def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
2984 def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
2985 def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
2986 def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
2987 def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
2988 def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
2989 def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
2990 def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
2991 def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
2992 def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
2993 def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
2994 def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
2995 def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
2996 def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
2997 def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
2998 def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
2999 def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
3000 def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
3001 def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
3002 def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
3003 def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
3004 def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
3005 def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
3006 def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
3007 def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
3008 def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
3009 def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
3010 def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
3011 def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
3014 // Move scalar to XMM zero-extended
3015 // movd to XMM register zero-extends
3016 let AddedComplexity = 15 in {
3017 // Zero a VR128, then do a MOVS{S|D} into its low element.
3018 def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
3019 (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>;
3020 def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
3021 (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>;
3022 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
3023 (MOVSSrr (v4f32 (V_SET0PS)),
3024 (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
3025 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
3026 (MOVSSrr (v4i32 (V_SET0PI)),
3027 (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
3030 // Splat v2f64 / v2i64
3031 let AddedComplexity = 10 in {
3032 def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
3033 (UNPCKLPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
3034 def : Pat<(unpckh (v2f64 VR128:$src), (undef)),
3035 (UNPCKHPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
3036 def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
3037 (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
3038 def : Pat<(unpckh (v2i64 VR128:$src), (undef)),
3039 (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
3042 // Special unary SHUFPSrri case.
3043 def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
3044 (SHUFPSrri VR128:$src1, VR128:$src1,
3045 (SHUFFLE_get_shuf_imm VR128:$src3))>;
3046 let AddedComplexity = 5 in
3047 def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
3048 (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
3049 Requires<[HasSSE2]>;
3050 // Special unary SHUFPDrri case (v2i64).
3051 def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
3052 (SHUFPDrri VR128:$src1, VR128:$src1,
3053 (SHUFFLE_get_shuf_imm VR128:$src3))>,
3054 Requires<[HasSSE2]>;
3055 // Special unary SHUFPDrri case (v2f64).
3056 def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
3057 (SHUFPDrri VR128:$src1, VR128:$src1,
3058 (SHUFFLE_get_shuf_imm VR128:$src3))>,
3059 Requires<[HasSSE2]>;
3060 // Unary v4f32 shuffle with PSHUF* in order to fold a load.
3061 def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
3062 (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
3063 Requires<[HasSSE2]>;
3065 // Special binary v4i32 shuffle cases with SHUFPS.
3066 def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
3067 (SHUFPSrri VR128:$src1, VR128:$src2,
3068 (SHUFFLE_get_shuf_imm VR128:$src3))>,
3069 Requires<[HasSSE2]>;
3070 def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
3071 (SHUFPSrmi VR128:$src1, addr:$src2,
3072 (SHUFFLE_get_shuf_imm VR128:$src3))>,
3073 Requires<[HasSSE2]>;
3074 // Special binary v2i64 shuffle cases using SHUFPDrri.
3075 def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
3076 (SHUFPDrri VR128:$src1, VR128:$src2,
3077 (SHUFFLE_get_shuf_imm VR128:$src3))>,
3078 Requires<[HasSSE2]>;
3080 // vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
3081 let AddedComplexity = 15 in {
3082 def : Pat<(v4i32 (unpckl_undef:$src2 VR128:$src, (undef))),
3083 (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
3084 Requires<[OptForSpeed, HasSSE2]>;
3085 def : Pat<(v4f32 (unpckl_undef:$src2 VR128:$src, (undef))),
3086 (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
3087 Requires<[OptForSpeed, HasSSE2]>;
3089 let AddedComplexity = 10 in {
3090 def : Pat<(v4f32 (unpckl_undef VR128:$src, (undef))),
3091 (UNPCKLPSrr VR128:$src, VR128:$src)>;
3092 def : Pat<(v16i8 (unpckl_undef VR128:$src, (undef))),
3093 (PUNPCKLBWrr VR128:$src, VR128:$src)>;
3094 def : Pat<(v8i16 (unpckl_undef VR128:$src, (undef))),
3095 (PUNPCKLWDrr VR128:$src, VR128:$src)>;
3096 def : Pat<(v4i32 (unpckl_undef VR128:$src, (undef))),
3097 (PUNPCKLDQrr VR128:$src, VR128:$src)>;
3100 // vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
3101 let AddedComplexity = 15 in {
3102 def : Pat<(v4i32 (unpckh_undef:$src2 VR128:$src, (undef))),
3103 (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
3104 Requires<[OptForSpeed, HasSSE2]>;
3105 def : Pat<(v4f32 (unpckh_undef:$src2 VR128:$src, (undef))),
3106 (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
3107 Requires<[OptForSpeed, HasSSE2]>;
3109 let AddedComplexity = 10 in {
3110 def : Pat<(v4f32 (unpckh_undef VR128:$src, (undef))),
3111 (UNPCKHPSrr VR128:$src, VR128:$src)>;
3112 def : Pat<(v16i8 (unpckh_undef VR128:$src, (undef))),
3113 (PUNPCKHBWrr VR128:$src, VR128:$src)>;
3114 def : Pat<(v8i16 (unpckh_undef VR128:$src, (undef))),
3115 (PUNPCKHWDrr VR128:$src, VR128:$src)>;
3116 def : Pat<(v4i32 (unpckh_undef VR128:$src, (undef))),
3117 (PUNPCKHDQrr VR128:$src, VR128:$src)>;
// Integer-typed (v4i32) movlhps / movhlps shuffles reuse the FP instructions
// MOVLHPSrr / MOVHLPSrr. The movhlps_undef forms pass $src1 as both operands.
3120 let AddedComplexity = 20 in {
3121 // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
3122 def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
3123 (MOVLHPSrr VR128:$src1, VR128:$src2)>;
3125 // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
3126 def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
3127 (MOVHLPSrr VR128:$src1, VR128:$src2)>;
3129 // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
3130 def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
3131 (MOVHLPSrr VR128:$src1, VR128:$src1)>;
3132 def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
3133 (MOVHLPSrr VR128:$src1, VR128:$src1)>;
// movlp with a loaded second operand: fold the load into MOVLPSrm for the
// four-element types (v4f32, v4i32) and MOVLPDrm for the two-element types
// (v2f64, v2i64).
3136 let AddedComplexity = 20 in {
3137 // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
3138 def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
3139 (MOVLPSrm VR128:$src1, addr:$src2)>;
3140 def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
3141 (MOVLPDrm VR128:$src1, addr:$src2)>;
3142 def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
3143 (MOVLPSrm VR128:$src1, addr:$src2)>;
3144 def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
3145 (MOVLPDrm VR128:$src1, addr:$src2)>;
3148 // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
// Load / movlp / store back to the same address ($src1) is selected as a single
// MOVLPSmr (v4f32, v4i32) or MOVLPDmr (v2f64, v2i64) store of $src2.
// The v4i32 form matches the load as bc_v4i32 of a loadv2i64.
3149 def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
3150 (MOVLPSmr addr:$src1, VR128:$src2)>;
3151 def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
3152 (MOVLPDmr addr:$src1, VR128:$src2)>;
3153 def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
3155 (MOVLPSmr addr:$src1, VR128:$src2)>;
3156 def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
3157 (MOVLPDmr addr:$src1, VR128:$src2)>;
// Register-register movl / movlp shuffles selected as MOVSSrr / MOVSDrr.
// The second operand is narrowed with EXTRACT_SUBREG (sub_ss or sub_sd) before
// being passed to the scalar move.
3159 let AddedComplexity = 15 in {
3160 // Setting the lowest element in the vector.
3161 def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
3162 (MOVSSrr (v4i32 VR128:$src1),
3163 (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
3164 def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
3165 (MOVSDrr (v2i64 VR128:$src1),
3166 (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
3168 // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
// These two movsd forms are only available with SSE2.
3169 def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
3170 (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
3171 Requires<[HasSSE2]>;
3172 def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
3173 (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
3174 Requires<[HasSSE2]>;
3177 // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
3178 // fall back to this for SSE1)
// Note the operands are swapped in the output ($src2 first, then $src1); the
// immediate comes from the matched shuffle node ($src3).
3179 def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
3180 (SHUFPSrri VR128:$src2, VR128:$src1,
3181 (SHUFFLE_get_shuf_imm VR128:$src3))>;
3183 // Set lowest element and zero upper elements.
// v2f64 X86vzmovl of a register is selected as MOVZPQILo2PQIrr (SSE2 only).
3184 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
3185 (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
3187 // Some special case pandn patterns.
// (and (xor $src1, all-ones), ...) is selected as PANDN. The all-ones vector
// may be typed v4i32, v8i16 or v16i8 and bitcast to v2i64, so each of the
// register (PANDNrr) and memory (PANDNrm) forms is spelled out three times.
// All six patterns require SSE2.
3188 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
3190 (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
3191 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
3193 (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
3194 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
3196 (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
3198 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
3199 (memop addr:$src2))),
3200 (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
3201 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
3202 (memop addr:$src2))),
3203 (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
3204 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
3205 (memop addr:$src2))),
3206 (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
3208 // vector -> vector casts
// Generic sint_to_fp / fp_to_sint on vectors are mapped onto the intrinsic
// (Int_*) conversion instructions. The v2i32 forms use the 64-bit VR64
// register class. All four require SSE2.
3209 def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
3210 (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;
3211 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
3212 (Int_CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
3213 def : Pat<(v2f64 (sint_to_fp (v2i32 VR64:$src))),
3214 (Int_CVTPI2PDrr VR64:$src)>, Requires<[HasSSE2]>;
3215 def : Pat<(v2i32 (fp_to_sint (v2f64 VR128:$src))),
3216 (Int_CVTTPD2PIrr VR128:$src)>, Requires<[HasSSE2]>;
3218 // Use movaps / movups for SSE integer load / store (one byte shorter).
// Aligned loads/stores select MOVAPS; plain load/store select MOVUPS.
// Loads are listed for v4i32 and v2i64; stores for all four integer vector
// types (v2i64, v4i32, v8i16, v16i8).
3219 def : Pat<(alignedloadv4i32 addr:$src),
3220 (MOVAPSrm addr:$src)>;
3221 def : Pat<(loadv4i32 addr:$src),
3222 (MOVUPSrm addr:$src)>;
3223 def : Pat<(alignedloadv2i64 addr:$src),
3224 (MOVAPSrm addr:$src)>;
3225 def : Pat<(loadv2i64 addr:$src),
3226 (MOVUPSrm addr:$src)>;
3228 def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
3229 (MOVAPSmr addr:$dst, VR128:$src)>;
3230 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3231 (MOVAPSmr addr:$dst, VR128:$src)>;
3232 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3233 (MOVAPSmr addr:$dst, VR128:$src)>;
3234 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3235 (MOVAPSmr addr:$dst, VR128:$src)>;
3236 def : Pat<(store (v2i64 VR128:$src), addr:$dst),
3237 (MOVUPSmr addr:$dst, VR128:$src)>;
3238 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3239 (MOVUPSmr addr:$dst, VR128:$src)>;
3240 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3241 (MOVUPSmr addr:$dst, VR128:$src)>;
3242 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3243 (MOVUPSmr addr:$dst, VR128:$src)>;
3245 //===----------------------------------------------------------------------===//
3246 // SSE4.1 Instructions
3247 //===----------------------------------------------------------------------===//
// sse41_fp_unop_rm - Packed single ("ps", opcode opcps) and packed double
// ("pd", opcode opcpd) forms of an SSE4.1 operation that takes one vector
// source plus an 8-bit immediate. Each precision gets a register-source and a
// memory-source def, matched through the V4F32Int / V2F64Int intrinsics.
// NOTE(review): PSm_Int is built on Ii8 while the other three defs use
// SS4AIi8; its trailing modifiers are not fully visible in this listing, so
// confirm the two spellings produce the same encoding.
3249 multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd,
3252 Intrinsic V2F64Int> {
3253 // Intrinsic operation, reg.
3254 // Vector intrinsic operation, reg
3255 def PSr_Int : SS4AIi8<opcps, MRMSrcReg,
3256 (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
3257 !strconcat(OpcodeStr,
3258 "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3259 [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]>,
3262 // Vector intrinsic operation, mem
3263 def PSm_Int : Ii8<opcps, MRMSrcMem,
3264 (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
3265 !strconcat(OpcodeStr,
3266 "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3268 (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>,
3270 Requires<[HasSSE41]>;
3272 // Vector intrinsic operation, reg
3273 def PDr_Int : SS4AIi8<opcpd, MRMSrcReg,
3274 (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
3275 !strconcat(OpcodeStr,
3276 "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3277 [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]>,
3280 // Vector intrinsic operation, mem
3281 def PDm_Int : SS4AIi8<opcpd, MRMSrcMem,
3282 (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
3283 !strconcat(OpcodeStr,
3284 "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3286 (V2F64Int (memopv2f64 addr:$src1),imm:$src2))]>,
// sse41_fp_binop_rm - Scalar single ("ss", opcode opcss) and scalar double
// ("sd", opcode opcsd) forms of an SSE4.1 operation taking two vector sources
// plus an 8-bit immediate, matched through the F32Int / F64Int intrinsics.
// $src1 is tied to $dst by the enclosing constraint, so the assembly strings
// print only $dst, $src2 and $src3. Memory forms use ssmem/sse_load_f32 and
// sdmem/sse_load_f64 respectively.
3290 let Constraints = "$src1 = $dst" in {
3291 multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
3295 // Intrinsic operation, reg.
3296 def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
3298 (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
3299 !strconcat(OpcodeStr,
3300 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3302 (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
3305 // Intrinsic operation, mem.
3306 def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
3308 (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
3309 !strconcat(OpcodeStr,
3310 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3312 (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
3315 // Intrinsic operation, reg.
3316 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
3318 (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
3319 !strconcat(OpcodeStr,
3320 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3322 (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
3325 // Intrinsic operation, mem.
3326 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
3328 (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
3329 !strconcat(OpcodeStr,
3330 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3332 (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
3337 // FP round - roundss, roundps, roundsd, roundpd
// The packed forms (opcodes 0x08 ps, 0x09 pd) come from sse41_fp_unop_rm and
// the scalar forms (0x0A ss, 0x0B sd) from sse41_fp_binop_rm; both share the
// ROUND prefix and the "round" mnemonic stem.
3338 defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round",
3339 int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
3340 defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
3341 int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
3343 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
// Defines a register form (rr128) and a memory form (rm128) matched through
// the IntId128 intrinsic; the memory operand is loaded as memopv8i16 and
// bitconverted.
3344 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
3345 Intrinsic IntId128> {
3346 def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
3348 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
3349 [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
3350 def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
3352 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
3355 (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
// phminposuw is the only instantiation visible here (opcode 0x41).
3358 defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
3359 int_x86_sse41_phminposuw>;
3361 /// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
/// Two-address ($src1 tied to $dst) register and memory forms matched through
/// the IntId128 intrinsic. The rr form's isCommutable is set from the
/// Commutable template argument (default 0); the rm form loads its second
/// operand as memopv16i8 and bitconverts it.
3362 let Constraints = "$src1 = $dst" in {
3363 multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
3364 Intrinsic IntId128, bit Commutable = 0> {
3365 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
3366 (ins VR128:$src1, VR128:$src2),
3367 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3368 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
3370 let isCommutable = Commutable;
3372 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
3373 (ins VR128:$src1, i128mem:$src2),
3374 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3376 (IntId128 VR128:$src1,
3377 (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
// Instantiations of SS41I_binop_rm_int. The trailing argument is the
// Commutable bit; PACKUSDW is the only one here instantiated with 0.
3381 defm PCMPEQQ : SS41I_binop_rm_int<0x29, "pcmpeqq",
3382 int_x86_sse41_pcmpeqq, 1>;
3383 defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw",
3384 int_x86_sse41_packusdw, 0>;
3385 defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb",
3386 int_x86_sse41_pminsb, 1>;
3387 defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd",
3388 int_x86_sse41_pminsd, 1>;
3389 defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud",
3390 int_x86_sse41_pminud, 1>;
3391 defm PMINUW : SS41I_binop_rm_int<0x3A, "pminuw",
3392 int_x86_sse41_pminuw, 1>;
3393 defm PMAXSB : SS41I_binop_rm_int<0x3C, "pmaxsb",
3394 int_x86_sse41_pmaxsb, 1>;
3395 defm PMAXSD : SS41I_binop_rm_int<0x3D, "pmaxsd",
3396 int_x86_sse41_pmaxsd, 1>;
3397 defm PMAXUD : SS41I_binop_rm_int<0x3F, "pmaxud",
3398 int_x86_sse41_pmaxud, 1>;
3399 defm PMAXUW : SS41I_binop_rm_int<0x3E, "pmaxuw",
3400 int_x86_sse41_pmaxuw, 1>;
3402 defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, 1>;
// Select the X86pcmpeqq node (v2i64) onto the PCMPEQQ instructions defined
// above: register operand -> PCMPEQQrr, memop operand -> PCMPEQQrm.
3404 def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
3405 (PCMPEQQrr VR128:$src1, VR128:$src2)>;
3406 def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
3407 (PCMPEQQrm VR128:$src1, addr:$src2)>;
3409 /// SS41I_binop_patint - Simple SSE 4.1 binary operator
/// Like SS41I_binop_rm_int, but each of the register and memory forms is
/// defined twice: rr / rm match the generic OpNode on OpVT, while rr_int /
/// rm_int match the IntId128 intrinsic. $src1 is tied to $dst, and the
/// register forms take isCommutable from the Commutable argument (default 0).
3410 let Constraints = "$src1 = $dst" in {
3411 multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, ValueType OpVT,
3412 SDNode OpNode, Intrinsic IntId128,
3413 bit Commutable = 0> {
3414 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
3415 (ins VR128:$src1, VR128:$src2),
3416 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3417 [(set VR128:$dst, (OpNode (OpVT VR128:$src1),
3418 VR128:$src2))]>, OpSize {
3419 let isCommutable = Commutable;
3421 def rr_int : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
3422 (ins VR128:$src1, VR128:$src2),
3423 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3424 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
3426 let isCommutable = Commutable;
3428 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
3429 (ins VR128:$src1, i128mem:$src2),
3430 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3432 (OpVT (OpNode VR128:$src1, (memop addr:$src2))))]>, OpSize;
3433 def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
3434 (ins VR128:$src1, i128mem:$src2),
3435 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3437 (IntId128 VR128:$src1, (memop addr:$src2)))]>,
3442 /// SS48I_binop_rm - Simple SSE41 binary operator.
/// Two-address register and memory forms matching a generic SDNode (OpNode)
/// rather than an intrinsic. The rr form is typed by OpVT and takes
/// isCommutable from the Commutable argument (default 0).
/// NOTE(review): the rm pattern hard-codes bc_v4i32 (memopv2i64 ...) instead
/// of using OpVT, so as written it only fits OpVT = v4i32 (the sole
/// instantiation visible here, PMULLD). Confirm before reusing with other types.
3443 let Constraints = "$src1 = $dst" in {
3444 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
3445 ValueType OpVT, bit Commutable = 0> {
3446 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
3447 (ins VR128:$src1, VR128:$src2),
3448 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3449 [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>,
3451 let isCommutable = Commutable;
3453 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
3454 (ins VR128:$src1, i128mem:$src2),
3455 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3456 [(set VR128:$dst, (OpNode VR128:$src1,
3457 (bc_v4i32 (memopv2i64 addr:$src2))))]>,
// pmulld: generic `mul` on v4i32, marked commutable.
3462 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, 1>;
3464 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
/// Two-address forms (rri: register source, rmi: memory source) matched
/// through the IntId128 intrinsic with the immediate as third operand. The
/// rri form takes isCommutable from the Commutable argument (default 0); the
/// rmi form loads its operand as memopv16i8 and bitconverts it.
3465 let Constraints = "$src1 = $dst" in {
3466 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
3467 Intrinsic IntId128, bit Commutable = 0> {
3468 def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
3469 (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
3470 !strconcat(OpcodeStr,
3471 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3473 (IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>,
3475 let isCommutable = Commutable;
3477 def rmi : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
3478 (ins VR128:$src1, i128mem:$src2, i32i8imm:$src3),
3479 !strconcat(OpcodeStr,
3480 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3482 (IntId128 VR128:$src1,
3483 (bitconvert (memopv16i8 addr:$src2)), imm:$src3))]>,
// Instantiations of SS41I_binop_rmi_int. The trailing argument is the
// Commutable bit: the blends and mpsadbw pass 0, dpps and dppd pass 1.
3488 defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps",
3489 int_x86_sse41_blendps, 0>;
3490 defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd",
3491 int_x86_sse41_blendpd, 0>;
3492 defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw",
3493 int_x86_sse41_pblendw, 0>;
3494 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps",
3495 int_x86_sse41_dpps, 1>;
3496 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd",
3497 int_x86_sse41_dppd, 1>;
3498 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw",
3499 int_x86_sse41_mpsadbw, 0>;
3502 /// SS41I_ternary_int - SSE 4.1 ternary operator
/// The third operand is the fixed register XMM0: it is listed in Uses, written
/// literally as %xmm0 in the assembly string, and passed directly as the last
/// argument of the IntId intrinsic. $src1 is tied to $dst.
3503 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
3504 multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
3505 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
3506 (ins VR128:$src1, VR128:$src2),
3507 !strconcat(OpcodeStr,
3508 "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
3509 [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
3512 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
3513 (ins VR128:$src1, i128mem:$src2),
3514 !strconcat(OpcodeStr,
3515 "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
3518 (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
// Variable blends: blendvpd (0x15), blendvps (0x14), pblendvb (0x10).
3522 defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
3523 defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
3524 defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
// SS41I_binop_rm_int8 - Despite the "binop" name, each def takes a single
// source. rr reads a VR128 register; rm reads an i64mem operand, matched as a
// loadi64 placed in a v2i64 via scalar_to_vector and then bitconverted.
3527 multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
3528 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3529 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
3530 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
3532 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
3533 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
3535 (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
// pmovsx* / pmovzx* variants whose memory form reads 64 bits.
3539 defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
3540 defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
3541 defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
3542 defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
3543 defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
3544 defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
3546 // Common patterns involving scalar load.
// For each of the six 64-bit-load pmovsx/pmovzx intrinsics, also fold the
// vzmovl_v2i64 and vzload_v2i64 operand forms into the corresponding rm
// instruction. All require SSE4.1.
3547 def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
3548 (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
3549 def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
3550 (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
3552 def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
3553 (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
3554 def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
3555 (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
3557 def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
3558 (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
3559 def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
3560 (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
3562 def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
3563 (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
3564 def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
3565 (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
3567 def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
3568 (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
3569 def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
3570 (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
3572 def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
3573 (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
3574 def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
3575 (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
// SS41I_binop_rm_int4 - Same shape as SS41I_binop_rm_int8 (single source,
// despite the "binop" name), but the rm form reads an i32mem operand, matched
// as a loadi32 placed in a v4i32 via scalar_to_vector and then bitconverted.
3578 multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
3579 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3580 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
3581 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
3583 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
3584 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
3586 (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
// pmovsx* / pmovzx* variants whose memory form reads 32 bits.
3590 defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
3591 defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
3592 defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
3593 defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
3595 // Common patterns involving scalar load
// For the four 32-bit-load pmovsx/pmovzx intrinsics, also fold the
// vzmovl_v4i32 operand form into the corresponding rm instruction (SSE4.1).
3596 def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
3597 (PMOVSXBDrm addr:$src)>, Requires<[HasSSE41]>;
3598 def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
3599 (PMOVSXWQrm addr:$src)>, Requires<[HasSSE41]>;
3601 def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
3602 (PMOVZXBDrm addr:$src)>, Requires<[HasSSE41]>;
3603 def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
3604 (PMOVZXWQrm addr:$src)>, Requires<[HasSSE41]>;
// SS41I_binop_rm_int2 - Same shape as SS41I_binop_rm_int8 / _int4 (single
// source), but the rm form reads an i16mem operand. Its pattern matches
// loadi16_anyext placed in a v4i32 via scalar_to_vector and bitconverted.
3607 multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
3608 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3609 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
3610 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
3612 // Expecting an i16 load any-extended to an i32 value.
3613 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
3614 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
3615 [(set VR128:$dst, (IntId (bitconvert
3616 (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
// pmovsxbq / pmovzxbq: the variants whose memory form reads 16 bits.
3620 defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
3621 defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
3623 // Common patterns involving scalar load
// Fold an X86vzmovl of a scalar_to_vector loadi32 into PMOVSXBQrm /
// PMOVZXBQrm (SSE4.1).
// NOTE(review): the source pattern loads 32 bits (loadi32) while the rm defs
// take an i16mem operand; confirm that narrowing the memory access this way is
// intended.
3624 def : Pat<(int_x86_sse41_pmovsxbq
3625 (bitconvert (v4i32 (X86vzmovl
3626 (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
3627 (PMOVSXBQrm addr:$src)>, Requires<[HasSSE41]>;
3629 def : Pat<(int_x86_sse41_pmovzxbq
3630 (bitconvert (v4i32 (X86vzmovl
3631 (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
3632 (PMOVZXBQrm addr:$src)>, Requires<[HasSSE41]>;
3635 /// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
/// rr writes a GR32 destination and matches X86pextrb on a v16i8 source with
/// an immediate index. mr takes an i8mem destination; per the note below, its
/// store pattern is not written out.
3636 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
3637 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
3638 (ins VR128:$src1, i32i8imm:$src2),
3639 !strconcat(OpcodeStr,
3640 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3641 [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
3643 def mr : SS4AIi8<opc, MRMDestMem, (outs),
3644 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
3645 !strconcat(OpcodeStr,
3646 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3649 // There's an AssertZext in the way of writing the store pattern
3650 // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
3653 defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
3656 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
/// Only the memory-destination form (mr, i16mem) is defined by this
/// multiclass; per the note below, its store pattern is not written out.
3657 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
3658 def mr : SS4AIi8<opc, MRMDestMem, (outs),
3659 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
3660 !strconcat(OpcodeStr,
3661 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3664 // There's an AssertZext in the way of writing the store pattern
3665 // (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
// NOTE(review): the v16i8 in the sketch above looks copied from the pextrb
// note; a 16-bit extract presumably takes a v8i16 source - confirm.
3668 defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
3671 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
/// Both forms match the generic extractelt on a v4i32 source with an immediate
/// index: rr writes a GR32, mr stores the element to an i32mem destination.
3672 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
3673 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
3674 (ins VR128:$src1, i32i8imm:$src2),
3675 !strconcat(OpcodeStr,
3676 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3678 (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
3679 def mr : SS4AIi8<opc, MRMDestMem, (outs),
3680 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
3681 !strconcat(OpcodeStr,
3682 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3683 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
3684 addr:$dst)]>, OpSize;
3687 defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
3690 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// The v4f32 source is viewed as v4i32 (bc_v4i32) and the element is taken
/// with extractelt: rr writes a GR32, mr stores to an f32mem destination.
3692 multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
3693 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
3694 (ins VR128:$src1, i32i8imm:$src2),
3695 !strconcat(OpcodeStr,
3696 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3698 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
3700 def mr : SS4AIi8<opc, MRMDestMem, (outs),
3701 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
3702 !strconcat(OpcodeStr,
3703 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3704 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
3705 addr:$dst)]>, OpSize;
3708 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
3710 // Also match an EXTRACTPS store when the store is done as f32 instead of i32.
// The extracted i32 element is bitconverted to f32 before the store; the
// result is still a single EXTRACTPSmr (SSE4.1).
3711 def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
3714 (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
3715 Requires<[HasSSE41]>;
// SS41I_insert8 - Insert an 8-bit value into a vector at an immediate index,
// matched through X86pinsrb. rr takes the value from a GR32 register; rm
// takes it from an i8mem operand via extloadi8. $src1 is tied to $dst.
3717 let Constraints = "$src1 = $dst" in {
3718 multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> {
3719 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
3720 (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
3721 !strconcat(OpcodeStr,
3722 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3724 (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
3725 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
3726 (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
3727 !strconcat(OpcodeStr,
3728 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3730 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
3731 imm:$src3))]>, OpSize;
3735 defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
// SS41I_insert32 - Insert a 32-bit integer into a v4i32 at an immediate index,
// matched through the generic insertelt. rr takes the value from a GR32
// register; rm takes it from an i32mem operand via loadi32. $src1 is tied to
// $dst.
3737 let Constraints = "$src1 = $dst" in {
3738 multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> {
3739 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
3740 (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
3741 !strconcat(OpcodeStr,
3742 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3744 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
3746 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
3747 (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
3748 !strconcat(OpcodeStr,
3749 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3751 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
3752 imm:$src3)))]>, OpSize;
3756 defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
3758 // insertps has a few different modes, there's the first two here below which
3759 // are optimized inserts that won't zero arbitrary elements in the destination
3760 // vector. The next one matches the intrinsic and could zero arbitrary elements
3761 // in the target vector.
// Both defs match the X86insrtps node with an immediate. rr takes a VR128
// second source; rm takes an f32mem operand, matched as a loadf32 placed in a
// v4f32 via scalar_to_vector. $src1 is tied to $dst.
3762 let Constraints = "$src1 = $dst" in {
3763 multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> {
3764 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
3765 (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
3766 !strconcat(OpcodeStr,
3767 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3769 (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
3771 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
3772 (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
3773 !strconcat(OpcodeStr,
3774 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
3776 (X86insrtps VR128:$src1,
3777 (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
3778 imm:$src3))]>, OpSize;
3782 defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
// The insertps intrinsic with register operands maps straight onto
// INSERTPSrr, forwarding the immediate unchanged.
3784 def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3),
3785 (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>;
3787 // ptest instruction. X86ISelLowering lowers to this, primarily from the
3788 // Intel intrinsic that corresponds to it.
// Both forms have no register outputs: they define EFLAGS and match the
// X86ptest node. The memory form matches a plain `load` of the second operand.
3789 let Defs = [EFLAGS] in {
3790 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
3791 "ptest \t{$src2, $src1|$src1, $src2}",
3792 [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>,
3794 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
3795 "ptest \t{$src2, $src1|$src1, $src2}",
3796 [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>,
// movntdqa: memory-source only (i128mem), matched through the
// int_x86_sse41_movntdqa intrinsic applied to the address.
3800 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3801 "movntdqa\t{$src, $dst|$dst, $src}",
3802 [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
3806 //===----------------------------------------------------------------------===//
3807 // SSE4.2 Instructions
3808 //===----------------------------------------------------------------------===//
3810 /// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
/// Same shape as SS41I_binop_rm_int but built on SS428I: two-address register
/// and memory forms matched through the IntId128 intrinsic. The rr form takes
/// isCommutable from the Commutable argument (default 0); the rm form loads
/// its operand as memopv16i8 and bitconverts it.
3811 let Constraints = "$src1 = $dst" in {
3812 multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
3813 Intrinsic IntId128, bit Commutable = 0> {
3814 def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst),
3815 (ins VR128:$src1, VR128:$src2),
3816 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3817 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
3819 let isCommutable = Commutable;
3821 def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst),
3822 (ins VR128:$src1, i128mem:$src2),
3823 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3825 (IntId128 VR128:$src1,
3826 (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
// pcmpgtq is instantiated without a Commutable argument, so it uses the
// default of 0.
3830 defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;
// Select the X86pcmpgtq node (v2i64) onto the PCMPGTQ instructions defined
// above: register operand -> PCMPGTQrr, memop operand -> PCMPGTQrm.
3832 def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
3833 (PCMPGTQrr VR128:$src1, VR128:$src2)>;
3834 def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
3835 (PCMPGTQrm VR128:$src1, addr:$src2)>;
3837 // crc intrinsic instruction
3838 // This set of instructions are only rm, the only difference is the size
// Naming used below: CRC32* defs accumulate in a GR32 ($src1 tied to $dst),
// CRC64* defs accumulate in a GR64. The m/r letter says whether the data
// operand ($src2) is memory or a register, and the trailing number is the
// data width in bits. Byte-wide data uses opcode 0xF0; every other width uses
// 0xF1. Each def matches the int_x86_sse42_crc32_* / crc64_* intrinsic of the
// same accumulator and data width.
3840 let Constraints = "$src1 = $dst" in {
3841 def CRC32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
3842 (ins GR32:$src1, i8mem:$src2),
3843 "crc32{b} \t{$src2, $src1|$src1, $src2}",
3845 (int_x86_sse42_crc32_8 GR32:$src1,
3846 (load addr:$src2)))]>;
3847 def CRC32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
3848 (ins GR32:$src1, GR8:$src2),
3849 "crc32{b} \t{$src2, $src1|$src1, $src2}",
3851 (int_x86_sse42_crc32_8 GR32:$src1, GR8:$src2))]>;
3852 def CRC32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
3853 (ins GR32:$src1, i16mem:$src2),
3854 "crc32{w} \t{$src2, $src1|$src1, $src2}",
3856 (int_x86_sse42_crc32_16 GR32:$src1,
3857 (load addr:$src2)))]>,
3859 def CRC32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
3860 (ins GR32:$src1, GR16:$src2),
3861 "crc32{w} \t{$src2, $src1|$src1, $src2}",
3863 (int_x86_sse42_crc32_16 GR32:$src1, GR16:$src2))]>,
3865 def CRC32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
3866 (ins GR32:$src1, i32mem:$src2),
3867 "crc32{l} \t{$src2, $src1|$src1, $src2}",
3869 (int_x86_sse42_crc32_32 GR32:$src1,
3870 (load addr:$src2)))]>;
3871 def CRC32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
3872 (ins GR32:$src1, GR32:$src2),
3873 "crc32{l} \t{$src2, $src1|$src1, $src2}",
3875 (int_x86_sse42_crc32_32 GR32:$src1, GR32:$src2))]>;
3876 def CRC64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
3877 (ins GR64:$src1, i8mem:$src2),
3878 "crc32{b} \t{$src2, $src1|$src1, $src2}",
3880 (int_x86_sse42_crc64_8 GR64:$src1,
3881 (load addr:$src2)))]>,
3883 def CRC64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
3884 (ins GR64:$src1, GR8:$src2),
3885 "crc32{b} \t{$src2, $src1|$src1, $src2}",
3887 (int_x86_sse42_crc64_8 GR64:$src1, GR8:$src2))]>,
3889 def CRC64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
3890 (ins GR64:$src1, i64mem:$src2),
3891 "crc32{q} \t{$src2, $src1|$src1, $src2}",
3893 (int_x86_sse42_crc64_64 GR64:$src1,
3894 (load addr:$src2)))]>,
3896 def CRC64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
3897 (ins GR64:$src1, GR64:$src2),
3898 "crc32{q} \t{$src2, $src1|$src1, $src2}",
3900 (int_x86_sse42_crc64_64 GR64:$src1, GR64:$src2))]>,
3904 // String/text processing instructions.
// Pseudo instructions (opcode 0, Pseudo format, usesCustomInserter = 1) that
// carry the selection pattern for int_x86_sse42_pcmpistrm128 and produce the
// result in an ordinary VR128 $dst. The REG form takes a register second
// operand, the MEM form a loaded i128mem operand.
3905 let Defs = [EFLAGS], usesCustomInserter = 1 in {
3906 def PCMPISTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst),
3907 (ins VR128:$src1, VR128:$src2, i8imm:$src3),
3908 "#PCMPISTRM128rr PSEUDO!",
3909 [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
3910 imm:$src3))]>, OpSize;
3911 def PCMPISTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst),
3912 (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
3913 "#PCMPISTRM128rm PSEUDO!",
3914 [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, (load addr:$src2),
3915 imm:$src3))]>, OpSize;
// The real pcmpistrm encodings (opcode 0x62). They have no explicit outputs
// and no selection pattern ([]); the result register XMM0 and EFLAGS are
// modelled as implicit Defs.
3918 let Defs = [XMM0, EFLAGS] in {
3919 def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
3920 (ins VR128:$src1, VR128:$src2, i8imm:$src3),
3921 "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
3922 def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
3923 (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
3924 "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
// Pseudo instructions (opcode 0, Pseudo format, usesCustomInserter = 1) that
// carry the selection pattern for int_x86_sse42_pcmpestrm128. EAX and EDX
// appear directly in the pattern as the intrinsic's second and fourth
// arguments and are listed as implicit Uses, which is why the explicit
// operands are numbered $src1, $src3 and $src5.
3927 let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
3928 def PCMPESTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst),
3929 (ins VR128:$src1, VR128:$src3, i8imm:$src5),
3930 "#PCMPESTRM128rr PSEUDO!",
3932 (int_x86_sse42_pcmpestrm128
3933 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>, OpSize;
3935 def PCMPESTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst),
3936 (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
3937 "#PCMPESTRM128rm PSEUDO!",
3938 [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
3939 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>,
// The real pcmpestrm encodings (opcode 0x60). No explicit outputs and no
// selection pattern ([]); XMM0 and EFLAGS are implicit Defs, EAX and EDX are
// implicit Uses.
3943 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in {
3944 def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
3945 (ins VR128:$src1, VR128:$src3, i8imm:$src5),
3946 "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
3947 def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
3948 (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
3949 "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
// SS42AI_pcmpistri - pcmpistri (opcode 0x63) in register and memory forms.
// There are no explicit outputs: the pattern sets ECX from the IntId128
// intrinsic and marks EFLAGS as implicitly defined. The multiclass is
// instantiated once per intrinsic below, so all instantiations share the same
// encoding and assembly string and differ only in the intrinsic they match.
3952 let Defs = [ECX, EFLAGS] in {
3953 multiclass SS42AI_pcmpistri<Intrinsic IntId128> {
3954 def rr : SS42AI<0x63, MRMSrcReg, (outs),
3955 (ins VR128:$src1, VR128:$src2, i8imm:$src3),
3956 "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
3957 [(set ECX, (IntId128 VR128:$src1, VR128:$src2, imm:$src3)),
3958 (implicit EFLAGS)]>, OpSize;
3959 def rm : SS42AI<0x63, MRMSrcMem, (outs),
3960 (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
3961 "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
3962 [(set ECX, (IntId128 VR128:$src1, (load addr:$src2), imm:$src3)),
3963 (implicit EFLAGS)]>, OpSize;
3967 defm PCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128>;
3968 defm PCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128>;
3969 defm PCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128>;
3970 defm PCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128>;
3971 defm PCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128>;
3972 defm PCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128>;
// SS42AI_pcmpestri: multiclass emitting the `rr` (register source) and `rm`
// (memory source) forms of "pcmpestri", opcode 0x61.
//   IntId128 - intrinsic matched by both selection patterns.
// The nested `let`s declare ECX and EFLAGS as implicit defs and EAX/EDX as
// implicit uses. EAX and EDX also appear directly in the patterns as the
// second and fourth intrinsic arguments.
3974 let Defs = [ECX, EFLAGS] in {
3975 let Uses = [EAX, EDX] in {
3976 multiclass SS42AI_pcmpestri<Intrinsic IntId128> {
3977 def rr : SS42AI<0x61, MRMSrcReg, (outs),
3978 (ins VR128:$src1, VR128:$src3, i8imm:$src5),
3979 "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}",
3980 [(set ECX, (IntId128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5)),
3981 (implicit EFLAGS)]>, OpSize;
// Memory form: the third intrinsic argument is a load from addr:$src3.
3982 def rm : SS42AI<0x61, MRMSrcMem, (outs),
3983 (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
3984 "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}",
3986 (IntId128 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5)),
3987 (implicit EFLAGS)]>, OpSize;
// Instantiate SS42AI_pcmpestri once per pcmpestri intrinsic variant (plain,
// a, c, o, s, z). Every defm expands to a <name>rr and a <name>rm record that
// share the same opcode and asm string and differ only in the intrinsic matched.
3992 defm PCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128>;
3993 defm PCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128>;
3994 defm PCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128>;
3995 defm PCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128>;
3996 defm PCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128>;
3997 defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>;
3999 //===----------------------------------------------------------------------===//
4000 // AES-NI Instructions
4001 //===----------------------------------------------------------------------===//
// AESI_binop_rm_int: multiclass for two-operand AES-NI intrinsic instructions.
//   opc        - opcode byte forwarded to AES8I.
//   OpcodeStr  - mnemonic prepended to the shared operand asm string.
//   IntId128   - intrinsic matched by the selection patterns.
//   Commutable - value assigned to isCommutable (defaults to 0).
// The surrounding `let` ties $src1 to $dst, so the first source register is
// also the destination.
4003 let Constraints = "$src1 = $dst" in {
4004 multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
4005 Intrinsic IntId128, bit Commutable = 0> {
// Register form: second operand comes from a VR128 register.
4006 def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
4007 (ins VR128:$src1, VR128:$src2),
4008 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4009 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4011 let isCommutable = Commutable;
// Memory form: second operand is a memopv16i8 load passed through bitconvert.
4013 def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
4014 (ins VR128:$src1, i128mem:$src2),
4015 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4017 (IntId128 VR128:$src1,
4018 (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
// Instantiate AESI_binop_rm_int for the four AES round instructions, using
// opcodes 0xDC through 0xDF. Each defm expands to a <name>rr and a <name>rm
// record. Commutable is left at its default of 0 for all four.
4022 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
4023 int_x86_aesni_aesenc>;
4024 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
4025 int_x86_aesni_aesenclast>;
4026 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
4027 int_x86_aesni_aesdec>;
4028 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
4029 int_x86_aesni_aesdeclast>;
// Explicit selection patterns for the four AES round intrinsics, each typed
// as v2i64. A register second operand selects the *rr record; a `memop`
// second operand selects the *rm record, with the address passed through.
// NOTE(review): these sit alongside the patterns already carried by the
// AESI_binop_rm_int records; presumably they cover the case where the load is
// not seen as memopv16i8 -- confirm before removing either set.
4031 def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
4032 (AESENCrr VR128:$src1, VR128:$src2)>;
4033 def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
4034 (AESENCrm VR128:$src1, addr:$src2)>;
4035 def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
4036 (AESENCLASTrr VR128:$src1, VR128:$src2)>;
4037 def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
4038 (AESENCLASTrm VR128:$src1, addr:$src2)>;
4039 def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
4040 (AESDECrr VR128:$src1, VR128:$src2)>;
4041 def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
4042 (AESDECrm VR128:$src1, addr:$src2)>;
4043 def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
4044 (AESDECLASTrr VR128:$src1, VR128:$src2)>;
4045 def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
4046 (AESDECLASTrm VR128:$src1, addr:$src2)>;
// AESIMCrr: register form of "aesimc" (opcode 0xDB, MRMSrcReg). The pattern
// applies int_x86_aesni_aesimc to VR128:$src1; the declared output is
// VR128:$dst.
4048 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
4050 "aesimc\t{$src1, $dst|$dst, $src1}",
4052 (int_x86_aesni_aesimc VR128:$src1))]>,
// AESIMCrm: memory form of "aesimc" (opcode 0xDB, MRMSrcMem). The source is a
// 128-bit memory operand, matched as a memopv2i64 load passed through
// bitconvert; the declared output is VR128:$dst.
4055 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
4056 (ins i128mem:$src1),
4057 "aesimc\t{$src1, $dst|$dst, $src1}",
4059 (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>,
// AESKEYGENASSIST128rr: register form of "aeskeygenassist" (AESAI class,
// opcode 0xDF, MRMSrcReg). Takes a VR128 source and an 8-bit immediate and
// matches int_x86_aesni_aeskeygenassist; the declared output is VR128:$dst.
4062 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
4063 (ins VR128:$src1, i8imm:$src2),
4064 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4066 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
4068 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
4069 (ins i128mem:$src1, i8imm:$src2),
4070 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4072 (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)),