SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ case Intrinsic::x86_avx512_kortestz:
+ case Intrinsic::x86_avx512_kortestc: {
+ unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B;
+ SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
+ SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+ SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
// SSE/AVX shift intrinsics
case Intrinsic::x86_sse2_psll_w:
if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
(Subtarget->hasInt256() &&
- (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+ (Subtarget->hasAVX512() &&
+ (VT == MVT::v8i64 || VT == MVT::v16i32))) {
if (Op.getOpcode() == ISD::SHL)
return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
DAG.getConstant(ShiftAmt, MVT::i32));
VT == MVT::v4i32 || VT == MVT::v8i16 ||
(Subtarget->hasInt256() &&
((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
- VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+ (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
SDValue BaseShAmt;
EVT EltVT = VT.getVectorElementType();
case MVT::v4i64:
case MVT::v8i32:
case MVT::v16i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
}
case ISD::SRA:
case MVT::v8i16:
case MVT::v8i32:
case MVT::v16i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
}
case ISD::SRL:
case MVT::v4i64:
case MVT::v8i32:
case MVT::v16i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
}
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
if (!Subtarget->is64Bit() &&
- (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+ (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
+ (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
if (V.getNode())
return V;
+ if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
+ return Op;
// AVX2 has VPSLLV/VPSRAV/VPSRLV.
if (Subtarget->hasInt256()) {
if (Op.getOpcode() == ISD::SRL &&
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
+ case X86ISD::TESTM: return "X86ISD::TESTM";
+ case X86ISD::KORTEST: return "X86ISD::KORTEST";
+ case X86ISD::KTEST: return "X86ISD::KTEST";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
SSE_ALU_ITINS_P.d, 0>,
EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+//===----------------------------------------------------------------------===//
+// AVX-512 VPTESTM instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
+ SDNode OpNode, ValueType vt> {
+ def rr : AVX5128I<opc, MRMSrcReg,
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))]>, EVEX_4V;
+ def rm : AVX5128I<opc, MRMSrcMem,
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode (vt RC:$src1),
+ (bitconvert (memop_frag addr:$src2))))]>, EVEX_4V;
+}
+
+defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem,
+ memopv16i32, X86testm, v16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem, memopv8i64,
+ X86testm, v8i64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Shift instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr,
+ SDNode OpNode, RegisterClass RC, ValueType vt,
+ X86MemOperand x86memop, PatFrag mem_frag> {
+ def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
+ (ins RC:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (i32 imm:$src2))))],
+ SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
+ def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
+ (ins x86memop:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpNode (mem_frag addr:$src1),
+ (i32 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
+}
+
+multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType vt, ValueType SrcVT,
+ PatFrag bc_frag> {
+ // src2 is always 128-bit
+ def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, VR128X:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))],
+ SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
+ def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (vt (OpNode RC:$src1,
+ (bc_frag (memopv2i64 addr:$src2)))))],
+ SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
+}
+
+defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli,
+ VR512, v16i32, i512mem, memopv16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl,
+ VR512, v16i32, v4i32, bc_v4i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VQ>;
+
+defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli,
+ VR512, v8i64, i512mem, memopv8i64>, EVEX_V512,
+ EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl,
+ VR512, v8i64, v2i64, bc_v2i64>, EVEX_V512,
+ EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli,
+ VR512, v16i32, i512mem, memopv16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl,
+ VR512, v16i32, v4i32, bc_v4i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VQ>;
+
+defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli,
+ VR512, v8i64, i512mem, memopv8i64>, EVEX_V512,
+ EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl,
+ VR512, v8i64, v2i64, bc_v2i64>, EVEX_V512,
+ EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai,
+ VR512, v16i32, i512mem, memopv16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra,
+ VR512, v16i32, v4i32, bc_v4i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VQ>;
+
+defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai,
+ VR512, v8i64, i512mem, memopv8i64>, EVEX_V512,
+ EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra,
+ VR512, v8i64, v2i64, bc_v2i64>, EVEX_V512,
+ EVEX_CD8<64, CD8VQ>, VEX_W;
+
+//===-------------------------------------------------------------------===//
+// Variable Bit Shifts
+//===-------------------------------------------------------------------===//
+multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType vt,
+ X86MemOperand x86memop, PatFrag mem_frag> {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (vt (OpNode RC:$src1, (vt RC:$src2))))]>,
+ EVEX_4V;
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>,
+ EVEX_4V;
+}
+
+defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32,
+ i512mem, memopv16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64,
+ i512mem, memopv8i64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32,
+ i512mem, memopv16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64,
+ i512mem, memopv8i64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32,
+ i512mem, memopv16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64,
+ i512mem, memopv8i64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
--- /dev/null
+;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+;CHECK-LABEL: shift_16_i32
+;CHECK: vpsrld
+;CHECK: vpslld
+;CHECK: vpsrad
+;CHECK: ret
+define <16 x i32> @shift_16_i32(<16 x i32> %a) {
+ %b = lshr <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %c = shl <16 x i32> %b, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
+ %d = ashr <16 x i32> %c, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
+ ret <16 x i32> %d;
+}
+
+;CHECK-LABEL: shift_8_i64
+;CHECK: vpsrlq
+;CHECK: vpsllq
+;CHECK: vpsraq
+;CHECK: ret
+define <8 x i64> @shift_8_i64(<8 x i64> %a) {
+ %b = lshr <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ %c = shl <8 x i64> %b, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
+ %d = ashr <8 x i64> %c, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
+ ret <8 x i64> %d;
+}
+
+; CHECK-LABEL: variable_shl4
+; CHECK: vpsllvq %zmm
+; CHECK: ret
+define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) {
+ %k = shl <8 x i64> %x, %y
+ ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_shl5
+; CHECK: vpsllvd %zmm
+; CHECK: ret
+define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) {
+ %k = shl <16 x i32> %x, %y
+ ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_srl0
+; CHECK: vpsrlvd
+; CHECK: ret
+define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) {
+ %k = lshr <16 x i32> %x, %y
+ ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_srl2
+; CHECK: psrlvq
+; CHECK: ret
+define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) {
+ %k = lshr <8 x i64> %x, %y
+ ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_sra1
+; CHECK: vpsravd
+; CHECK: ret
+define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) {
+ %k = ashr <16 x i32> %x, %y
+ ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_sra2
+; CHECK: vpsravq %zmm
+; CHECK: ret
+define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) {
+ %k = ashr <8 x i64> %x, %y
+ ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_sra01_load
+; CHECK: vpsravd (%
+; CHECK: ret
+define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) {
+ %y1 = load <16 x i32>* %y
+ %k = ashr <16 x i32> %x, %y1
+ ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_shl1_load
+; CHECK: vpsllvd (%
+; CHECK: ret
+define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) {
+ %y1 = load <16 x i32>* %y
+ %k = shl <16 x i32> %x, %y1
+ ret <16 x i32> %k
+}
+; CHECK: variable_srl0_load
+; CHECK: vpsrlvd (%
+; CHECK: ret
+define <16 x i32> @variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) {
+ %y1 = load <16 x i32>* %y
+ %k = lshr <16 x i32> %x, %y1
+ ret <16 x i32> %k
+}
+
+; CHECK: variable_srl3_load
+; CHECK: vpsrlvq (%
+; CHECK: ret
+define <8 x i64> @variable_srl3_load(<8 x i64> %x, <8 x i64>* %y) {
+ %y1 = load <8 x i64>* %y
+ %k = lshr <8 x i64> %x, %y1
+ ret <8 x i64> %k
+}