bool X86::isPSHUFDMask(SDNode *N) {
assert(N->getOpcode() == ISD::BUILD_VECTOR);
- if (N->getNumOperands() != 4)
+ if (N->getNumOperands() != 2 && N->getNumOperands() != 4)
return false;
// Check if the value doesn't reference the second vector.
SDOperand Arg = N->getOperand(i);
if (Arg.getOpcode() == ISD::UNDEF) continue;
assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
- if (cast<ConstantSDNode>(Arg)->getValue() >= 4)
+ if (cast<ConstantSDNode>(Arg)->getValue() >= e)
return false;
}
// If VT is integer, try PSHUF* first, then SHUFP*.
if (MVT::isInteger(VT)) {
- if (X86::isPSHUFDMask(PermMask.Val) ||
+ // MMX doesn't have PSHUFD; it does have PSHUFW. While it's theoretically
+ // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
+ if (((MVT::getSizeInBits(VT) != 64 || NumElems == 4) &&
+ X86::isPSHUFDMask(PermMask.Val)) ||
X86::isPSHUFHWMask(PermMask.Val) ||
X86::isPSHUFLWMask(PermMask.Val)) {
if (V2.getOpcode() != ISD::UNDEF)
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(v4f32 (vector_shuffle
- VR128:$src1, (load addr:$src2),
+ VR128:$src1, (memopv4f32 addr:$src2),
SHUFP_shuffle_mask:$src3)))]>;
let AddedComplexity = 10 in {
"unpckhps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (vector_shuffle
- VR128:$src1, (load addr:$src2),
+ VR128:$src1, (memopv4f32 addr:$src2),
UNPCKH_shuffle_mask)))]>;
def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
"unpcklps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (vector_shuffle
- VR128:$src1, (load addr:$src2),
+ VR128:$src1, (memopv4f32 addr:$src2),
UNPCKL_shuffle_mask)))]>;
} // AddedComplexity
} // isTwoAddress
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(v2f64 (vector_shuffle
- VR128:$src1, (load addr:$src2),
+ VR128:$src1, (memopv2f64 addr:$src2),
SHUFP_shuffle_mask:$src3)))]>;
let AddedComplexity = 10 in {
"unpckhpd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v2f64 (vector_shuffle
- VR128:$src1, (load addr:$src2),
+ VR128:$src1, (memopv2f64 addr:$src2),
UNPCKH_shuffle_mask)))]>;
def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
"unpcklpd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v2f64 (vector_shuffle
- VR128:$src1, (load addr:$src2),
+ VR128:$src1, (memopv2f64 addr:$src2),
UNPCKL_shuffle_mask)))]>;
} // AddedComplexity
} // isTwoAddress
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"pandn\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
- (load addr:$src2))))]>;
+ (memopv2i64 addr:$src2))))]>;
}
// SSE2 Integer comparison
SHUFP_unary_shuffle_mask:$sm),
(SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
Requires<[HasSSE1]>;
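+// Special unary SHUFPDrri case: a v2f64 shuffle whose second operand is undef
+// is selected as SHUFPDrri with $src1 used for both source operands.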
+def : Pat<(vector_shuffle (v2f64 VR128:$src1), (undef),
+ SHUFP_unary_shuffle_mask:$sm),
+ (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
+ Requires<[HasSSE2]>;
// Unary v4f32 shuffle with PSHUF* in order to fold a load.
def : Pat<(vector_shuffle (memopv4f32 addr:$src1), (undef),
SHUFP_unary_shuffle_mask:$sm),
(PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
- (load addr:$src2))),
+ (memopv2i64 addr:$src2))),
(PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
- (load addr:$src2))),
+ (memopv2i64 addr:$src2))),
(PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
- (load addr:$src2))),
+ (memopv2i64 addr:$src2))),
(PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
// Use movaps / movups for SSE integer load / store (one byte shorter).
--- /dev/null
+; RUN: llvm-as < %s | llc -march=x86-64 | grep unpck | wc -l | grep 2
+; RUN: llvm-as < %s | llc -march=x86-64 | grep shuf | wc -l | grep 2
+; RUN: llvm-as < %s | llc -march=x86-64 | grep ps | wc -l | grep 4
+; RUN: llvm-as < %s | llc -march=x86-64 | grep pd | wc -l | grep 4
+; RUN: llvm-as < %s | llc -march=x86-64 | grep movup | wc -l | grep 4
+
+; @a: unary 4-lane shuffle. Loads a <4 x float> through %y and rebuilds it with
+; the lanes in reverse order, i.e. the result is <x3, x2, x1, x0>.
+define <4 x float> @a(<4 x float>* %y)
+{
+; The load is only 4-byte aligned. NOTE(review): presumably this is what forces
+; the separate movup* that the RUN lines count -- confirm against llc output.
+ %x = load <4 x float>* %y, align 4
+; Pull out every lane of the loaded vector.
+ %a = extractelement <4 x float> %x, i32 0
+ %b = extractelement <4 x float> %x, i32 1
+ %c = extractelement <4 x float> %x, i32 2
+ %d = extractelement <4 x float> %x, i32 3
+; Re-insert them back to front: lane 0 <- x3, 1 <- x2, 2 <- x1, 3 <- x0.
+ %p = insertelement <4 x float> undef, float %d, i32 0
+ %q = insertelement <4 x float> %p, float %c, i32 1
+ %r = insertelement <4 x float> %q, float %b, i32 2
+ %s = insertelement <4 x float> %r, float %a, i32 3
+ ret <4 x float> %s
+}
+; @b: binary 4-lane shuffle. Interleaves the upper two lanes of the register
+; argument %z with the upper two lanes of the vector loaded through %y, giving
+; <z2, x2, z3, x3>.
+define <4 x float> @b(<4 x float>* %y, <4 x float> %z)
+{
+; The load is only 4-byte aligned (less than the 16-byte vector size).
+ %x = load <4 x float>* %y, align 4
+; Upper half of the loaded vector.
+ %a = extractelement <4 x float> %x, i32 2
+ %b = extractelement <4 x float> %x, i32 3
+; Upper half of the register operand.
+ %c = extractelement <4 x float> %z, i32 2
+ %d = extractelement <4 x float> %z, i32 3
+; Interleave: lane 0 <- z2, 1 <- x2, 2 <- z3, 3 <- x3.
+ %p = insertelement <4 x float> undef, float %c, i32 0
+ %q = insertelement <4 x float> %p, float %a, i32 1
+ %r = insertelement <4 x float> %q, float %d, i32 2
+ %s = insertelement <4 x float> %r, float %b, i32 3
+ ret <4 x float> %s
+}
+; @c: unary 2-lane shuffle. Loads a <2 x double> through %y and swaps its two
+; lanes, i.e. the result is <x1, x0>.
+define <2 x double> @c(<2 x double>* %y)
+{
+; The load is only 8-byte aligned (less than the 16-byte vector size).
+ %x = load <2 x double>* %y, align 8
+ %a = extractelement <2 x double> %x, i32 0
+ %c = extractelement <2 x double> %x, i32 1
+; Swap: lane 0 <- x1, lane 1 <- x0.
+ %p = insertelement <2 x double> undef, double %c, i32 0
+ %r = insertelement <2 x double> %p, double %a, i32 1
+ ret <2 x double> %r
+}
+; @d: binary 2-lane shuffle. Combines the high lane of the register argument %z
+; with the high lane of the vector loaded through %y, giving <z1, x1>.
+define <2 x double> @d(<2 x double>* %y, <2 x double> %z)
+{
+; The load is only 8-byte aligned (less than the 16-byte vector size).
+ %x = load <2 x double>* %y, align 8
+; High lane of each input.
+ %a = extractelement <2 x double> %x, i32 1
+ %c = extractelement <2 x double> %z, i32 1
+; Lane 0 <- z1, lane 1 <- x1.
+ %p = insertelement <2 x double> undef, double %c, i32 0
+ %r = insertelement <2 x double> %p, double %a, i32 1
+ ret <2 x double> %r
+}