From 17bbdd05dd35093e3b7b2f5dfe850a54ac19137e Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Mon, 27 Apr 2015 12:57:59 +0000 Subject: [PATCH] AVX-512: Extend/Truncate operations for SKX, SETCC for bit-vectors git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@235875 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 74 +++++++++++++++++++- lib/Target/X86/X86InstrAVX512.td | 101 ++++++++++++++++++--------- test/CodeGen/X86/avx512-mask-op.ll | 26 +++++++ test/CodeGen/X86/avx512-trunc-ext.ll | 19 +++++ test/CodeGen/X86/avx512-vec-cmp.ll | 24 +++++++ 5 files changed, 208 insertions(+), 36 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index c4d4a0f6ce7..bf61ab8ba4b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1303,6 +1303,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + if (Subtarget->hasDQI()) { + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + } setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); @@ -1313,7 +1317,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); - + if (Subtarget->hasDQI()) { + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); + } setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal); setOperationAction(ISD::FCEIL, MVT::v16f32, Legal); @@ -12055,6 
+12062,23 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); + // move vector to mask - truncate solution for SKX + if (VT.getVectorElementType() == MVT::i1) { + if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI()) + return Op; // legal, will go to VPMOVB2M, VPMOVW2M + if ((InVT.is256BitVector() || InVT.is128BitVector()) + && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI() && Subtarget->hasVLX()) + return Op; // legal, will go to VPMOVB2M, VPMOVW2M + if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI()) + return Op; // legal, will go to VPMOVD2M, VPMOVQ2M + if ((InVT.is256BitVector() || InVT.is128BitVector()) + && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI() && Subtarget->hasVLX()) + return Op; // legal, will go to VPMOVD2M, VPMOVQ2M + } if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { if (VT.getVectorElementType().getSizeInBits() >=8) return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); @@ -13001,6 +13025,49 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } +static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + assert(Op0.getValueType().getVectorElementType() == MVT::i1 && + "Unexpected type for boolean compare operation"); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, + DAG.getConstant(-1, VT)); + SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1, + DAG.getConstant(-1, VT)); + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETNE: + // (x != y) -> ~(x ^ y) + return
DAG.getNode(ISD::XOR, dl, VT, + DAG.getNode(ISD::XOR, dl, VT, Op0, Op1), + DAG.getConstant(-1, VT)); + case ISD::SETEQ: + // (x == y) -> (x ^ y) + return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1); + case ISD::SETUGT: + case ISD::SETGT: + // (x > y) -> (x & ~y) + return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1); + case ISD::SETULT: + case ISD::SETLT: + // (x < y) -> (~x & y) + return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1); + case ISD::SETULE: + case ISD::SETLE: + // (x <= y) -> (~x | y) + return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1); + case ISD::SETUGE: + case ISD::SETGE: + // (x >=y) -> (x | ~y) + return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1); + } +} + static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue Op0 = Op.getOperand(0); @@ -13119,8 +13186,11 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); - bool MaskResult = (VT.getVectorElementType() == MVT::i1); EVT OpVT = Op1.getValueType(); + if (OpVT.getVectorElementType() == MVT::i1) + return LowerBoolVSETCC_AVX512(Op, DAG); + + bool MaskResult = (VT.getVectorElementType() == MVT::i1); if (Subtarget->hasAVX512()) { if (Op1.getValueType().is512BitVector() || (Subtarget->hasBWI() && Subtarget->hasVLX()) || diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 60b6310ecc5..db15dfaca4f 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1906,18 +1906,21 @@ def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>; let Predicates = [HasAVX512, NoDQI] in { def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; - def : Pat<(not VK8:$src), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; } +def : Pat<(xor VK4:$src1, (v4i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS 
VK4:$src1, VK16)), VK4)>; +def : Pat<(xor VK2:$src1, (v2i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src1, VK16)), VK2)>; // Mask binary operation // - KAND, KANDN, KOR, KXNOR, KXOR multiclass avx512_mask_binop opc, string OpcodeStr, RegisterClass KRC, SDPatternOperator OpNode, - Predicate prd> { - let Predicates = [prd] in + Predicate prd, bit IsCommutable> { + let Predicates = [prd], isCommutable = IsCommutable in def rr : I opc, string OpcodeStr, } multiclass avx512_mask_binop_all opc, string OpcodeStr, - SDPatternOperator OpNode> { + SDPatternOperator OpNode, bit IsCommutable> { defm B : avx512_mask_binop, VEX_4V, VEX_L, PD; + HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; defm W : avx512_mask_binop, VEX_4V, VEX_L, PS; + HasAVX512, IsCommutable>, VEX_4V, VEX_L, PS; defm D : avx512_mask_binop, VEX_4V, VEX_L, VEX_W, PD; + HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; defm Q : avx512_mask_binop, VEX_4V, VEX_L, VEX_W, PS; + HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS; } def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; -let isCommutable = 1 in { - defm KAND : avx512_mask_binop_all<0x41, "kand", and>; - defm KOR : avx512_mask_binop_all<0x45, "kor", or>; - defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor>; - defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor>; -} -let isCommutable = 0 in - defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn>; - -def : Pat<(xor VK1:$src1, VK1:$src2), - (COPY_TO_REGCLASS (KXORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), - (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; - -def : Pat<(or VK1:$src1, VK1:$src2), - (COPY_TO_REGCLASS (KORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), - (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; - -def : Pat<(and VK1:$src1, VK1:$src2), - (COPY_TO_REGCLASS (KANDWrr (COPY_TO_REGCLASS VK1:$src1, VK16), - (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; +defm KAND : 
avx512_mask_binop_all<0x41, "kand", and, 1>; +defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; +defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>; +defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; +defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>; multiclass avx512_mask_binop_int { let Predicates = [HasAVX512] in @@ -1975,13 +1963,28 @@ defm : avx512_mask_binop_int<"kor", "KOR">; defm : avx512_mask_binop_int<"kxnor", "KXNOR">; defm : avx512_mask_binop_int<"kxor", "KXOR">; -// With AVX-512, 8-bit mask is promoted to 16-bit mask. multiclass avx512_binop_pat { - let Predicates = [HasAVX512] in - def : Pat<(OpNode VK8:$src1, VK8:$src2), - (COPY_TO_REGCLASS - (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), - (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + // With AVX512F, 8-bit mask is promoted to 16-bit mask, + // for the DQI set, this type is legal and KxxxB instruction is used + let Predicates = [NoDQI] in + def : Pat<(OpNode VK8:$src1, VK8:$src2), + (COPY_TO_REGCLASS + (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + + // All types smaller than 8 bits require conversion anyway + def : Pat<(OpNode VK1:$src1, VK1:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + def : Pat<(OpNode VK2:$src1, VK2:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK2:$src1, VK16), + (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>; + def : Pat<(OpNode VK4:$src1, VK4:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK4:$src1, VK16), + (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>; } defm : avx512_binop_pat; @@ -1990,6 +1993,32 @@ defm : avx512_binop_pat; defm : avx512_binop_pat; defm : avx512_binop_pat; +def : Pat<(xor (xor VK16:$src1, VK16:$src2), (v16i1 immAllOnesV)), + (KXNORWrr VK16:$src1, VK16:$src2)>; +def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), + (KXNORBrr VK8:$src1, VK8:$src2)>; +def : Pat<(xor (xor VK32:$src1, VK32:$src2), 
(v32i1 immAllOnesV)), + (KXNORDrr VK32:$src1, VK32:$src2)>; +def : Pat<(xor (xor VK64:$src1, VK64:$src2), (v64i1 immAllOnesV)), + (KXNORQrr VK64:$src1, VK64:$src2)>; + +let Predicates = [NoDQI] in +def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + +def : Pat<(xor (xor VK4:$src1, VK4:$src2), (v4i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK4:$src1, VK16), + (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>; + +def : Pat<(xor (xor VK2:$src1, VK2:$src2), (v2i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK2:$src1, VK16), + (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>; + +def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + // Mask unpacking multiclass avx512_mask_unpck opc, string OpcodeStr, RegisterClass KRC> { @@ -2085,6 +2114,8 @@ multiclass avx512_mask_setop { multiclass avx512_mask_setop_w { defm B : avx512_mask_setop; defm W : avx512_mask_setop; + defm D : avx512_mask_setop; + defm Q : avx512_mask_setop; } defm KSET0 : avx512_mask_setop_w; @@ -2094,6 +2125,8 @@ defm KSET1 : avx512_mask_setop_w; let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; + def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>; + def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>; def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>; def : Pat<(i1 1), (COPY_TO_REGCLASS (KSET1W), VK1)>; def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSET1W), VK1)>; diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index c4e62517eb0..677524a9565 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -131,4 +131,30 @@ entry: %mask = load <8 x 
i1>, <8 x i1>* %maskPtr %mask_convert = bitcast <8 x i1> %mask to i8 ret i8 %mask_convert +} + +; SKX-LABEL: test4 +; SKX: vpcmpgt +; SKX: knot +; SKX: vpcmpgt +; SKX: vpmovm2d +define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) { + %x_gt_y = icmp sgt <4 x i64> %x, %y + %x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1 + %res = icmp sgt <4 x i1>%x_gt_y, %x1_gt_y1 + %resse = sext <4 x i1>%res to <4 x i32> + ret <4 x i32> %resse +} + +; SKX-LABEL: test5 +; SKX: vpcmpgt +; SKX: knot +; SKX: vpcmpgt +; SKX: vpmovm2q +define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) { + %x_gt_y = icmp slt <2 x i64> %x, %y + %x1_gt_y1 = icmp sgt <2 x i64> %x1, %y1 + %res = icmp slt <2 x i1>%x_gt_y, %x1_gt_y1 + %resse = sext <2 x i1>%res to <2 x i64> + ret <2 x i64> %resse } \ No newline at end of file diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-trunc-ext.ll index 91ef5d58f43..09806e3ffb5 100644 --- a/test/CodeGen/X86/avx512-trunc-ext.ll +++ b/test/CodeGen/X86/avx512-trunc-ext.ll @@ -90,6 +90,8 @@ define <8 x i64> @zext_8i1_to_8xi64(i8 %b) { ; CHECK: vpandd ; CHECK: vptestmd ; CHECK: ret +; SKX-LABEL: trunc_16i8_to_16i1 +; SKX: vpmovb2m %xmm define i16 @trunc_16i8_to_16i1(<16 x i8> %a) { %mask_b = trunc <16 x i8>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 @@ -100,17 +102,34 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) { ; CHECK: vpandd ; CHECK: vptestmd ; CHECK: ret +; SKX-LABEL: trunc_16i32_to_16i1 +; SKX: vpmovd2m %zmm define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { %mask_b = trunc <16 x i32>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 ret i16 %mask } +; SKX-LABEL: trunc_4i32_to_4i1 +; SKX: vpmovd2m %xmm +; SKX: kandw +; SKX: vpmovm2d +define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) { + %mask_a = trunc <4 x i32>%a to <4 x i1> + %mask_b = trunc <4 x i32>%b to <4 x i1> + %a_and_b = and <4 x i1>%mask_a, %mask_b + %res = sext <4 x i1>%a_and_b to <4 x i32> + ret 
<4 x i32>%res +} + ; CHECK-LABEL: trunc_8i16_to_8i1 ; CHECK: vpmovsxwq ; CHECK: vpandq LCP{{.*}}(%rip){1to8} ; CHECK: vptestmq ; CHECK: ret + +; SKX-LABEL: trunc_8i16_to_8i1 +; SKX: vpmovw2m %xmm define i8 @trunc_8i16_to_8i1(<8 x i16> %a) { %mask_b = trunc <8 x i16>%a to <8 x i1> %mask = bitcast <8 x i1> %mask_b to i8 diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index 26e2c776b25..4808ea9f854 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -374,3 +374,27 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 ret <8 x i64> %max } + +; CHECK-LABEL: test28 +; CHECK: vpcmpgtq +; CHECK: vpcmpgtq +; CHECK: kxorw +define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) { + %x_gt_y = icmp sgt <8 x i64> %x, %y + %x1_gt_y1 = icmp sgt <8 x i64> %x1, %y1 + %res = icmp eq <8 x i1>%x_gt_y, %x1_gt_y1 + %resse = sext <8 x i1>%res to <8 x i32> + ret <8 x i32> %resse +} + +; CHECK-LABEL: test29 +; CHECK: vpcmpgtd +; CHECK: vpcmpgtd +; CHECK: kxnorw +define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) { + %x_gt_y = icmp sgt <16 x i32> %x, %y + %x1_gt_y1 = icmp sgt <16 x i32> %x1, %y1 + %res = icmp ne <16 x i1>%x_gt_y, %x1_gt_y1 + %resse = sext <16 x i1>%res to <16 x i8> + ret <16 x i8> %resse +} \ No newline at end of file -- 2.34.1