From 73ae1df82c09dfde3c7069def938c9638d6ff1e2 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Thu, 4 Dec 2014 09:40:44 +0000
Subject: [PATCH] Masked Load / Store Intrinsics - the CodeGen part.

I'm recommitting the codegen part of the patch.
The vectorizer part will be sent for review again.

Masked Vector Load and Store Intrinsics.
Introduced new target-independent intrinsics in order to support masked vector
loads and stores. The loop vectorizer optimizes loops containing conditional
memory accesses by generating these intrinsics for the existing targets AVX2
and AVX-512. The vectorizer asks the target about the availability of masked
vector loads and stores.
Added SDNodes for masked operations and lowering patterns for the X86 code
generator.

Examples:

declare <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32> %passthru, i32 4 /* align */, <16 x i1> %mask)
declare void @llvm.masked.store.v8f64(i8* %addr, <8 x double> %value, i32 4, <8 x i1> %mask)

A scalarizer for other targets (not AVX2/AVX-512) will be done in a separate patch.

http://reviews.llvm.org/D6191

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@223348 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h   |   7 +
 include/llvm/CodeGen/ISDOpcodes.h             |   3 +
 include/llvm/CodeGen/SelectionDAG.h           |   4 +
 include/llvm/CodeGen/SelectionDAGNodes.h      |  68 ++++++++
 include/llvm/IR/IRBuilder.h                   |  11 ++
 include/llvm/IR/Intrinsics.h                  |  11 +-
 include/llvm/IR/Intrinsics.td                 |  15 ++
 include/llvm/Target/TargetSelectionDAG.td     |  13 ++
 lib/Analysis/TargetTransformInfo.cpp          |  11 ++
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      | 160 ++++++++++++++++++
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  23 +++
 lib/CodeGen/SelectionDAG/LegalizeTypes.h      |   4 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 114 +++++++++++++
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |  54 ++++++
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  70 ++++++++
 .../SelectionDAG/SelectionDAGBuilder.h        |   2 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   2 +
 lib/IR/Function.cpp                           |  18 +-
 lib/IR/IRBuilder.cpp                          |  26 +++
 lib/IR/Verifier.cpp                           |  13 ++
 lib/Target/X86/X86ISelLowering.cpp            |  22 ++-
 lib/Target/X86/X86InstrAVX512.td              |  75 ++++++++
 lib/Target/X86/X86InstrSSE.td                 |  55 ++++++
 lib/Target/X86/X86TargetTransformInfo.cpp     |  18 ++
 test/CodeGen/X86/masked_memop.ll              |  73 ++++++++
 utils/TableGen/CodeGenTarget.cpp              |   3 +-
 utils/TableGen/IntrinsicEmitter.cpp           |  10 +-
 27 files changed, 873 insertions(+), 12 deletions(-)
 create mode 100644 test/CodeGen/X86/masked_memop.ll

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 9acaaa6f2eb..8af8f77ad67 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -270,6 +270,13 @@ public:
                               int64_t BaseOffset, bool HasBaseReg,
                               int64_t Scale) const;
 
+  /// \brief Return true if the target supports masked (predicated) memory
+  /// instructions. AVX2 allows masks for consecutive loads and stores of i32
+  /// and i64 elements. The AVX-512 architecture will also allow masks for
+  /// non-consecutive memory accesses.
+  virtual bool isLegalPredicatedStore(Type *DataType, int Consecutive) const;
+  virtual bool isLegalPredicatedLoad (Type *DataType, int Consecutive) const;
+
   /// \brief Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
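[Editor's note, not part of the patch: a minimal sketch of how a client such as
the loop vectorizer might consult the two new TTI hooks before emitting the
masked intrinsics. The helper name canVectorizeConditionalAccess and the way
the Consecutive flag is passed are illustrative assumptions, not code from this
commit.]

// Hypothetical caller of the new hooks (illustration only).
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static bool canVectorizeConditionalAccess(const TargetTransformInfo &TTI,
                                          Instruction *I, bool Consecutive) {
  // Only emit llvm.masked.load/store when the target reports the operation as
  // legal; with this patch that is consecutive accesses of 32- or 64-bit
  // elements on AVX2 and AVX-512.
  if (auto *LI = dyn_cast<LoadInst>(I))
    return TTI.isLegalPredicatedLoad(LI->getType(), Consecutive ? 1 : 0);
  if (auto *SI = dyn_cast<StoreInst>(I))
    return TTI.isLegalPredicatedStore(SI->getValueOperand()->getType(),
                                      Consecutive ? 1 : 0);
  return false;
}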
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index bbf0ad30458..4f55ccac16e 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -675,6 +675,9 @@ namespace ISD { ATOMIC_LOAD_UMIN, ATOMIC_LOAD_UMAX, + // Masked load and store + MLOAD, MSTORE, + /// This corresponds to the llvm.lifetime.* intrinsics. The first operand /// is the chain and the second operand is the alloca pointer. LIFETIME_START, LIFETIME_END, diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index fbdaf0d7fd2..4950797bb1e 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -866,6 +866,10 @@ public: SDValue getIndexedStore(SDValue OrigStoe, SDLoc dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM); + SDValue getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, + SDValue Mask, SDValue Src0, MachineMemOperand *MMO); + SDValue getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val, + SDValue Ptr, SDValue Mask, MachineMemOperand *MMO); /// getSrcValue - Construct a node to track a Value* through the backend. SDValue getSrcValue(const Value *v); diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 47158272412..acd178892d0 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1177,6 +1177,8 @@ public: N->getOpcode() == ISD::ATOMIC_LOAD_UMAX || N->getOpcode() == ISD::ATOMIC_LOAD || N->getOpcode() == ISD::ATOMIC_STORE || + N->getOpcode() == ISD::MLOAD || + N->getOpcode() == ISD::MSTORE || N->isMemIntrinsic() || N->isTargetMemoryOpcode(); } @@ -1926,6 +1928,72 @@ public: } }; +/// MaskedLoadStoreSDNode - This is a base class is used to represent MLOAD and +/// MSTORE nodes +/// +class MaskedLoadStoreSDNode : public MemSDNode { + // Operands + SDUse Ops[4]; +public: + friend class SelectionDAG; + MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl, + SDValue *Operands, unsigned numOperands, + SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) + : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { + InitOperands(Ops, Operands, numOperands); + } + + // In the both nodes address is Op1, mask is Op2: + // MaskedLoadSDNode (Chain, ptr, mask, src0), src0 is a passthru value + // MaskedStoreSDNode (Chain, ptr, mask, data) + // Mask is a vector of i1 elements + const SDValue &getBasePtr() const { return getOperand(1); } + const SDValue &getMask() const { return getOperand(2); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == ISD::MLOAD || + N->getOpcode() == ISD::MSTORE; + } +}; + +/// MaskedLoadSDNode - This class is used to represent an MLOAD node +/// +class MaskedLoadSDNode : public MaskedLoadStoreSDNode { +public: + friend class SelectionDAG; + MaskedLoadSDNode(unsigned Order, DebugLoc dl, + SDValue *Operands, unsigned numOperands, + SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) + : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, Operands, numOperands, + VTs, MemVT, MMO) + {} + + const SDValue &getSrc0() const { return getOperand(3); } + static bool classof(const SDNode *N) { + return N->getOpcode() == ISD::MLOAD; + } +}; + +/// MaskedStoreSDNode - This class is used to represent an MSTORE node +/// +class MaskedStoreSDNode : public MaskedLoadStoreSDNode { + +public: + friend class SelectionDAG; + MaskedStoreSDNode(unsigned Order, DebugLoc dl, + SDValue *Operands, unsigned numOperands, + SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) + : 
MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, Operands, numOperands, + VTs, MemVT, MMO) + {} + + const SDValue &getData() const { return getOperand(3); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == ISD::MSTORE; + } +}; + /// MachineSDNode - An SDNode that represents everything that will be needed /// to construct a MachineInstr. These nodes are created during the /// instruction selection proper phase. diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h index 80ada57f4c8..e564ca52adf 100644 --- a/include/llvm/IR/IRBuilder.h +++ b/include/llvm/IR/IRBuilder.h @@ -429,11 +429,22 @@ public: /// If the pointer isn't i8* it will be converted. CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = nullptr); + /// \brief Create a call to Masked Load intrinsic + CallInst *CreateMaskedLoad(ArrayRef Ops); + + /// \brief Create a call to Masked Store intrinsic + CallInst *CreateMaskedStore(ArrayRef Ops); + /// \brief Create an assume intrinsic call that allows the optimizer to /// assume that the provided condition will be true. CallInst *CreateAssumption(Value *Cond); private: + /// \brief Create a call to a masked intrinsic with given Id. + /// Masked intrinsic has only one overloaded type - data type. + CallInst *CreateMaskedIntrinsic(unsigned Id, ArrayRef Ops, + Type *DataTy); + Value *getCastedInt8PtrValue(Value *Ptr); }; diff --git a/include/llvm/IR/Intrinsics.h b/include/llvm/IR/Intrinsics.h index acc0e9e5d37..5236f195140 100644 --- a/include/llvm/IR/Intrinsics.h +++ b/include/llvm/IR/Intrinsics.h @@ -76,7 +76,8 @@ namespace Intrinsic { enum IITDescriptorKind { Void, VarArg, MMX, Metadata, Half, Float, Double, Integer, Vector, Pointer, Struct, - Argument, ExtendArgument, TruncArgument, HalfVecArgument + Argument, ExtendArgument, TruncArgument, HalfVecArgument, + SameVecWidthArgument } Kind; union { @@ -96,13 +97,15 @@ namespace Intrinsic { }; unsigned getArgumentNumber() const { assert(Kind == Argument || Kind == ExtendArgument || - Kind == TruncArgument || Kind == HalfVecArgument); + Kind == TruncArgument || Kind == HalfVecArgument || + Kind == SameVecWidthArgument); return Argument_Info >> 2; } ArgKind getArgumentKind() const { assert(Kind == Argument || Kind == ExtendArgument || - Kind == TruncArgument || Kind == HalfVecArgument); - return (ArgKind)(Argument_Info&3); + Kind == TruncArgument || Kind == HalfVecArgument || + Kind == SameVecWidthArgument); + return (ArgKind)(Argument_Info & 3); } static IITDescriptor get(IITDescriptorKind K, unsigned Field) { diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 5457e9577c4..08bdf3a2642 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -112,6 +112,10 @@ class LLVMMatchType // the intrinsic is overloaded, so the matched type should be declared as iAny. 
class LLVMExtendedType : LLVMMatchType; class LLVMTruncatedType : LLVMMatchType; +class LLVMVectorSameWidth + : LLVMMatchType { + ValueType ElTy = elty.VT; +} // Match the type of another intrinsic parameter that is expected to be a // vector type, but change the element count to be half as many @@ -555,6 +559,17 @@ def int_convertuu : Intrinsic<[llvm_anyint_ty], def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], [], "llvm.clear_cache">; +//===-------------------------- Masked Intrinsics -------------------------===// +// +def int_masked_store : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty, + llvm_i32_ty, + LLVMVectorSameWidth<0, llvm_i1_ty>], + [IntrReadWriteArgMem]>; + +def int_masked_load : Intrinsic<[llvm_anyvector_ty], + [llvm_ptr_ty, LLVMMatchType<0>, llvm_i32_ty, + LLVMVectorSameWidth<0, llvm_i1_ty>], + [IntrReadArgMem]>; //===----------------------------------------------------------------------===// // Target-specific intrinsics //===----------------------------------------------------------------------===// diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index f63afd70983..907baa1b9b1 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -188,6 +188,14 @@ def SDTIStore : SDTypeProfile<1, 3, [ // indexed store SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3> ]>; +def SDTMaskedStore: SDTypeProfile<0, 3, [ // masked store + SDTCisPtrTy<0>, SDTCisVec<1>, SDTCisVec<2> +]>; + +def SDTMaskedLoad: SDTypeProfile<1, 3, [ // masked load + SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameAs<0, 3> +]>; + def SDTVecShuffle : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> ]>; @@ -454,6 +462,11 @@ def atomic_load : SDNode<"ISD::ATOMIC_LOAD", SDTAtomicLoad, def atomic_store : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def masked_store : SDNode<"ISD::MSTORE", SDTMaskedStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def masked_load : SDNode<"ISD::MLOAD", SDTMaskedLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + // Do not use ld, st directly. Use load, extload, sextload, zextload, store, // and truncst (see below). 
def ld : SDNode<"ISD::LOAD" , SDTLoad, diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 3d8a37811ba..d10bdd83150 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -101,6 +101,17 @@ bool TargetTransformInfo::isLegalICmpImmediate(int64_t Imm) const { return PrevTTI->isLegalICmpImmediate(Imm); } +bool TargetTransformInfo::isLegalPredicatedLoad(Type *DataType, + int Consecutive) const { + return false; +} + +bool TargetTransformInfo::isLegalPredicatedStore(Type *DataType, + int Consecutive) const { + return false; +} + + bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 201429fe754..7347111728e 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -303,6 +303,8 @@ namespace { SDValue visitEXTRACT_SUBVECTOR(SDNode *N); SDValue visitVECTOR_SHUFFLE(SDNode *N); SDValue visitINSERT_SUBVECTOR(SDNode *N); + SDValue visitMLOAD(SDNode *N); + SDValue visitMSTORE(SDNode *N); SDValue XformToShuffleWithZero(SDNode *N); SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS); @@ -1351,6 +1353,8 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N); case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N); case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N); + case ISD::MLOAD: return visitMLOAD(N); + case ISD::MSTORE: return visitMSTORE(N); } return SDValue(); } @@ -4771,6 +4775,162 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1)); } +SDValue DAGCombiner::visitMSTORE(SDNode *N) { + + if (Level >= AfterLegalizeTypes) + return SDValue(); + + MaskedStoreSDNode *MST = dyn_cast(N); + SDValue Mask = MST->getMask(); + SDValue Data = MST->getData(); + SDLoc DL(N); + + // If the MSTORE data type requires splitting and the mask is provided by a + // SETCC, then split both nodes and its operands before legalization. This + // prevents the type legalizer from unrolling SETCC into scalar comparisons + // and enables future optimizations (e.g. min/max pattern matching on X86). + if (Mask.getOpcode() == ISD::SETCC) { + + // Check if any splitting is required. + if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) != + TargetLowering::TypeSplitVector) + return SDValue(); + + SDValue MaskLo, MaskHi, Lo, Hi; + std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MST->getValueType(0)); + + SDValue Chain = MST->getChain(); + SDValue Ptr = MST->getBasePtr(); + + EVT MemoryVT = MST->getMemoryVT(); + unsigned Alignment = MST->getOriginalAlignment(); + + // if Alignment is equal to the vector size, + // take the half of it for the second part + unsigned SecondHalfAlignment = + (Alignment == Data->getValueType(0).getSizeInBits()/8) ? + Alignment/2 : Alignment; + + EVT LoMemVT, HiMemVT; + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + + SDValue DataLo, DataHi; + std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); + + MachineMemOperand *MMO = DAG.getMachineFunction(). 
+ getMachineMemOperand(MST->getPointerInfo(), + MachineMemOperand::MOStore, LoMemVT.getStoreSize(), + Alignment, MST->getAAInfo(), MST->getRanges()); + + Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, MMO); + + unsigned IncrementSize = LoMemVT.getSizeInBits()/8; + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(IncrementSize, Ptr.getValueType())); + + MMO = DAG.getMachineFunction(). + getMachineMemOperand(MST->getPointerInfo(), + MachineMemOperand::MOStore, HiMemVT.getStoreSize(), + SecondHalfAlignment, MST->getAAInfo(), + MST->getRanges()); + + Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, MMO); + + AddToWorklist(Lo.getNode()); + AddToWorklist(Hi.getNode()); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); + } + return SDValue(); +} + +SDValue DAGCombiner::visitMLOAD(SDNode *N) { + + if (Level >= AfterLegalizeTypes) + return SDValue(); + + MaskedLoadSDNode *MLD = dyn_cast(N); + SDValue Mask = MLD->getMask(); + SDLoc DL(N); + + // If the MLOAD result requires splitting and the mask is provided by a + // SETCC, then split both nodes and its operands before legalization. This + // prevents the type legalizer from unrolling SETCC into scalar comparisons + // and enables future optimizations (e.g. min/max pattern matching on X86). + + if (Mask.getOpcode() == ISD::SETCC) { + EVT VT = N->getValueType(0); + + // Check if any splitting is required. + if (TLI.getTypeAction(*DAG.getContext(), VT) != + TargetLowering::TypeSplitVector) + return SDValue(); + + SDValue MaskLo, MaskHi, Lo, Hi; + std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); + + SDValue Src0 = MLD->getSrc0(); + SDValue Src0Lo, Src0Hi; + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); + + SDValue Chain = MLD->getChain(); + SDValue Ptr = MLD->getBasePtr(); + EVT MemoryVT = MLD->getMemoryVT(); + unsigned Alignment = MLD->getOriginalAlignment(); + + // if Alignment is equal to the vector size, + // take the half of it for the second part + unsigned SecondHalfAlignment = + (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? + Alignment/2 : Alignment; + + EVT LoMemVT, HiMemVT; + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(MLD->getPointerInfo(), + MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), + Alignment, MLD->getAAInfo(), MLD->getRanges()); + + Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, MMO); + + unsigned IncrementSize = LoMemVT.getSizeInBits()/8; + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(IncrementSize, Ptr.getValueType())); + + MMO = DAG.getMachineFunction(). + getMachineMemOperand(MLD->getPointerInfo(), + MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), + SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges()); + + Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, MMO); + + AddToWorklist(Lo.getNode()); + AddToWorklist(Hi.getNode()); + + // Build a factor node to remember that this load is independent of the + // other one. + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Legalized the chain result - switch anything that used the old chain to + // use the new one. 
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain); + + SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); + + SDValue RetOps[] = { LoadRes, Chain }; + return DAG.getMergeValues(RetOps, DL); + } + return SDValue(); +} + SDValue DAGCombiner::visitVSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index b73bb0a897b..52c2d1be430 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -825,6 +825,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = PromoteIntOp_STORE(cast(N), OpNo); break; + case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast(N), + OpNo); break; + case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast(N), + OpNo); break; case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::FP16_TO_FP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; @@ -1091,6 +1095,25 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){ N->getMemoryVT(), N->getMemOperand()); } +SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){ + + assert(OpNo == 2 && "Only know how to promote the mask!"); + EVT DataVT = N->getOperand(3).getValueType(); + SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); + SmallVector NewOps(N->op_begin(), N->op_end()); + NewOps[OpNo] = Mask; + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + +SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){ + assert(OpNo == 2 && "Only know how to promote the mask!"); + EVT DataVT = N->getValueType(0); + SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); + SmallVector NewOps(N->op_begin(), N->op_end()); + NewOps[OpNo] = Mask; + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 30f412ba317..805b0fc0463 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -285,6 +285,8 @@ private: SDValue PromoteIntOp_TRUNCATE(SDNode *N); SDValue PromoteIntOp_UINT_TO_FP(SDNode *N); SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N); + SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -578,6 +580,7 @@ private: void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -594,6 +597,7 @@ private: SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); + SDValue 
SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N); SDValue SplitVecOp_TRUNCATE(SDNode *N); SDValue SplitVecOp_VSETCC(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 27f63d27823..88f67370228 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -597,6 +597,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::LOAD: SplitVecRes_LOAD(cast(N), Lo, Hi); break; + case ISD::MLOAD: + SplitVecRes_MLOAD(cast(N), Lo, Hi); + break; case ISD::SETCC: SplitVecRes_SETCC(N, Lo, Hi); break; @@ -979,6 +982,64 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, ReplaceValueWith(SDValue(LD, 1), Ch); } +void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, + SDValue &Lo, SDValue &Hi) { + EVT LoVT, HiVT; + SDLoc dl(MLD); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); + + SDValue Ch = MLD->getChain(); + SDValue Ptr = MLD->getBasePtr(); + SDValue Mask = MLD->getMask(); + unsigned Alignment = MLD->getOriginalAlignment(); + + // if Alignment is equal to the vector size, + // take the half of it for the second part + unsigned SecondHalfAlignment = + (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? + Alignment/2 : Alignment; + + SDValue MaskLo, MaskHi; + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + + EVT MemoryVT = MLD->getMemoryVT(); + EVT LoMemVT, HiMemVT; + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + + SDValue Src0 = MLD->getSrc0(); + SDValue Src0Lo, Src0Hi; + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); + + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(MLD->getPointerInfo(), + MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), + Alignment, MLD->getAAInfo(), MLD->getRanges()); + + Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, MMO); + + unsigned IncrementSize = LoMemVT.getSizeInBits()/8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getConstant(IncrementSize, Ptr.getValueType())); + + MMO = DAG.getMachineFunction(). + getMachineMemOperand(MLD->getPointerInfo(), + MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), + SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges()); + + Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, MMO); + + + // Build a factor node to remember that this load is independent of the + // other one. + Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Legalized the chain result - switch anything that used the old chain to + // use the new one. 
+ ReplaceValueWith(SDValue(MLD, 1), Ch); + +} + void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && @@ -1234,6 +1295,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::STORE: Res = SplitVecOp_STORE(cast(N), OpNo); break; + case ISD::MSTORE: + Res = SplitVecOp_MSTORE(cast(N), OpNo); + break; case ISD::VSELECT: Res = SplitVecOp_VSELECT(N, OpNo); break; @@ -1395,6 +1459,56 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { MachinePointerInfo(), EltVT, false, false, false, 0); } +SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, + unsigned OpNo) { + SDValue Ch = N->getChain(); + SDValue Ptr = N->getBasePtr(); + SDValue Mask = N->getMask(); + SDValue Data = N->getData(); + EVT MemoryVT = N->getMemoryVT(); + unsigned Alignment = N->getOriginalAlignment(); + SDLoc DL(N); + + EVT LoMemVT, HiMemVT; + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + + SDValue DataLo, DataHi; + GetSplitVector(Data, DataLo, DataHi); + SDValue MaskLo, MaskHi; + GetSplitVector(Mask, MaskLo, MaskHi); + + // if Alignment is equal to the vector size, + // take the half of it for the second part + unsigned SecondHalfAlignment = + (Alignment == Data->getValueType(0).getSizeInBits()/8) ? + Alignment/2 : Alignment; + + SDValue Lo, Hi; + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(N->getPointerInfo(), + MachineMemOperand::MOStore, LoMemVT.getStoreSize(), + Alignment, N->getAAInfo(), N->getRanges()); + + Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, MMO); + + unsigned IncrementSize = LoMemVT.getSizeInBits()/8; + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(IncrementSize, Ptr.getValueType())); + + MMO = DAG.getMachineFunction(). + getMachineMemOperand(N->getPointerInfo(), + MachineMemOperand::MOStore, HiMemVT.getStoreSize(), + SecondHalfAlignment, N->getAAInfo(), N->getRanges()); + + Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, MMO); + + + // Build a factor node to remember that this store is independent of the + // other one. 
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); + +} + SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { assert(N->isUnindexed() && "Indexed store of vector?"); assert(OpNo == 1 && "Can only split the stored value"); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 7961e66d8c8..57ec81bba4a 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4917,6 +4917,60 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base, return SDValue(N, 0); } +SDValue +SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, + SDValue Ptr, SDValue Mask, SDValue Src0, + MachineMemOperand *MMO) { + + SDVTList VTs = getVTList(VT, MVT::Other); + SDValue Ops[] = { Chain, Ptr, Mask, Src0 }; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED, + MMO->isVolatile(), + MMO->isNonTemporal(), + MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(), + dl.getDebugLoc(), Ops, 4, VTs, + VT, MMO); + CSEMap.InsertNode(N, IP); + InsertNode(N); + return SDValue(N, 0); +} + +SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val, + SDValue Ptr, SDValue Mask, MachineMemOperand *MMO) { + assert(Chain.getValueType() == MVT::Other && + "Invalid chain type"); + EVT VT = Val.getValueType(); + SDVTList VTs = getVTList(MVT::Other); + SDValue Ops[] = { Chain, Ptr, Mask, Val }; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), + MMO->isNonTemporal(), MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(), + dl.getDebugLoc(), Ops, 4, + VTs, VT, MMO); + CSEMap.InsertNode(N, IP); + InsertNode(N); + return SDValue(N, 0); +} + SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue SV, diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index eeecbf11bad..c3b197e97eb 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3617,6 +3617,70 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { DAG.setRoot(StoreNode); } +void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) { + SDLoc sdl = getCurSDLoc(); + + Value *PtrOperand = I.getArgOperand(0); + SDValue Ptr = getValue(PtrOperand); + SDValue Src0 = getValue(I.getArgOperand(1)); + SDValue Mask = getValue(I.getArgOperand(3)); + EVT VT = Src0.getValueType(); + unsigned Alignment = (cast(I.getArgOperand(2)))->getZExtValue(); + if (!Alignment) + Alignment = DAG.getEVTAlignment(VT); + + AAMDNodes AAInfo; + I.getAAMetadata(AAInfo); + + MachineMemOperand *MMO = + DAG.getMachineFunction(). 
+ getMachineMemOperand(MachinePointerInfo(PtrOperand), + MachineMemOperand::MOStore, VT.getStoreSize(), + Alignment, AAInfo); + SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, MMO); + DAG.setRoot(StoreNode); + setValue(&I, StoreNode); +} + +void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) { + SDLoc sdl = getCurSDLoc(); + + Value *PtrOperand = I.getArgOperand(0); + SDValue Ptr = getValue(PtrOperand); + SDValue Src0 = getValue(I.getArgOperand(1)); + SDValue Mask = getValue(I.getArgOperand(3)); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = TLI.getValueType(I.getType()); + unsigned Alignment = (cast(I.getArgOperand(2)))->getZExtValue(); + if (!Alignment) + Alignment = DAG.getEVTAlignment(VT); + + AAMDNodes AAInfo; + I.getAAMetadata(AAInfo); + const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); + + SDValue InChain = DAG.getRoot(); + if (AA->pointsToConstantMemory( + AliasAnalysis::Location(PtrOperand, + AA->getTypeStoreSize(I.getType()), + AAInfo))) { + // Do not serialize (non-volatile) loads of constant memory with anything. + InChain = DAG.getEntryNode(); + } + + MachineMemOperand *MMO = + DAG.getMachineFunction(). + getMachineMemOperand(MachinePointerInfo(PtrOperand), + MachineMemOperand::MOLoad, VT.getStoreSize(), + Alignment, AAInfo, Ranges); + + SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, MMO); + SDValue OutChain = Load.getValue(1); + DAG.setRoot(OutChain); + setValue(&I, Load); +} + void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering SuccessOrder = I.getSuccessOrdering(); @@ -4918,6 +4982,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } + case Intrinsic::masked_load: + visitMaskedLoad(I); + return nullptr; + case Intrinsic::masked_store: + visitMaskedStore(I); + return nullptr; case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 2ae1e96a63e..b2517b93e45 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -769,6 +769,8 @@ private: void visitAlloca(const AllocaInst &I); void visitLoad(const LoadInst &I); void visitStore(const StoreInst &I); + void visitMaskedLoad(const CallInst &I); + void visitMaskedStore(const CallInst &I); void visitAtomicCmpXchg(const AtomicCmpXchgInst &I); void visitAtomicRMW(const AtomicRMWInst &I); void visitFence(const FenceInst &I); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index c9f6cff9155..e8577d898c2 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -269,6 +269,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { // Other operators case ISD::LOAD: return "load"; case ISD::STORE: return "store"; + case ISD::MLOAD: return "masked_load"; + case ISD::MSTORE: return "masked_store"; case ISD::VAARG: return "vaarg"; case ISD::VACOPY: return "vacopy"; case ISD::VAEND: return "vaend"; diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index c7d3d149e0c..ba6e4ce79a1 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -551,7 +551,8 @@ enum IIT_Info { IIT_ANYPTR = 26, IIT_V1 = 27, IIT_VARARG = 28, - IIT_HALF_VEC_ARG = 29 + IIT_HALF_VEC_ARG = 29, + IIT_SAME_VEC_WIDTH_ARG 
= 30 }; @@ -659,6 +660,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, ArgInfo)); return; } + case IIT_SAME_VEC_WIDTH_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument, + ArgInfo)); + return; + } case IIT_EMPTYSTRUCT: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0)); return; @@ -766,7 +773,14 @@ static Type *DecodeFixedType(ArrayRef &Infos, case IITDescriptor::HalfVecArgument: return VectorType::getHalfElementsVectorType(cast( Tys[D.getArgumentNumber()])); - } + case IITDescriptor::SameVecWidthArgument: + Type *EltTy = DecodeFixedType(Infos, Tys, Context); + Type *Ty = Tys[D.getArgumentNumber()]; + if (VectorType *VTy = dyn_cast(Ty)) { + return VectorType::get(EltTy, VTy->getNumElements()); + } + llvm_unreachable("unhandled"); + } llvm_unreachable("unhandled"); } diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp index a4c5d9766a2..5f63ded48fe 100644 --- a/lib/IR/IRBuilder.cpp +++ b/lib/IR/IRBuilder.cpp @@ -183,3 +183,29 @@ CallInst *IRBuilderBase::CreateAssumption(Value *Cond) { return createCallHelper(FnAssume, Ops, this); } +/// Create a call to a Masked Load intrinsic. +/// Ops - an array of operands. +CallInst *IRBuilderBase::CreateMaskedLoad(ArrayRef Ops) { + // The only one overloaded type - the type of passthru value in this case + Type *DataTy = Ops[1]->getType(); + return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy); +} + +/// Create a call to a Masked Store intrinsic. +/// Ops - an array of operands. +CallInst *IRBuilderBase::CreateMaskedStore(ArrayRef Ops) { + // DataTy - type of the data to be stored - the only one overloaded type + Type *DataTy = Ops[1]->getType(); + return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, DataTy); +} + +/// Create a call to a Masked intrinsic, with given intrinsic Id, +/// an array of operands - Ops, and one overloaded type - DataTy +CallInst *IRBuilderBase::CreateMaskedIntrinsic(unsigned Id, + ArrayRef Ops, + Type *DataTy) { + Module *M = BB->getParent()->getParent(); + Type *OverloadedTypes[] = { DataTy }; + Value *TheFn = Intrinsic::getDeclaration(M, (Intrinsic::ID)Id, OverloadedTypes); + return createCallHelper(TheFn, Ops, this); +} diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 976786d047c..c025a958385 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -2406,6 +2406,19 @@ bool Verifier::VerifyIntrinsicType(Type *Ty, !isa(ArgTys[D.getArgumentNumber()]) || VectorType::getHalfElementsVectorType( cast(ArgTys[D.getArgumentNumber()])) != Ty; + case IITDescriptor::SameVecWidthArgument: { + if (D.getArgumentNumber() >= ArgTys.size()) + return true; + VectorType * ReferenceType = + dyn_cast(ArgTys[D.getArgumentNumber()]); + VectorType *ThisArgType = dyn_cast(Ty); + if (!ThisArgType || !ReferenceType || + (ReferenceType->getVectorNumElements() != + ThisArgType->getVectorNumElements())) + return true; + return VerifyIntrinsicType(ThisArgType->getVectorElementType(), + Infos, ArgTys); + } } llvm_unreachable("unhandled"); } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index c21e6146115..cdb691248ef 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1319,13 +1319,21 @@ void X86TargetLowering::resetOperationActions() { // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. 
- if (VT.is128BitVector()) + if (VT.is128BitVector()) { + if (VT.getScalarSizeInBits() >= 32) { + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Custom); + } setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - + } // Do not attempt to custom lower other non-256-bit vectors if (!VT.is256BitVector()) continue; + if (VT.getScalarSizeInBits() >= 32) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); @@ -1492,9 +1500,13 @@ void X86TargetLowering::resetOperationActions() { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); // Extract subvector is special because the value type // (result) is 256/128-bit but the source is 512-bit wide. - if (VT.is128BitVector() || VT.is256BitVector()) + if (VT.is128BitVector() || VT.is256BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - + if ( EltSize >= 32) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } + } if (VT.getVectorElementType() == MVT::i1) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); @@ -1510,6 +1522,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); } } for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 273b4add5e7..1b24163f44b 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2122,6 +2122,41 @@ def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), + (VMOVUPSZmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)), + (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)), + (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, + (bc_v16f32 (v16i32 immAllZerosV)))), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))), + (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8f64 (v16i32 immAllZerosV)))), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))), + (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32", "16", "8", "4", 
SSEPackedInt, HasAVX512>, avx512_store_vl<0x7F, "vmovdqa32", "alignedstore", @@ -2196,6 +2231,46 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))), + (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8i64 (v16i32 immAllZerosV)))), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))), + (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)), + (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)), + (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +// SKX replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>; + +// KNL replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Zmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + + // Move Int Doubleword to Packed Double Int // def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 18ba3b45197..547e3835a72 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -9260,6 +9260,61 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256>, VEX_W; +def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), + (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), + (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), + (bc_v8f32 (v8i32 immAllZerosV)))), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), + (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(masked_store addr:$ptr, 
(v4i64 VR256:$mask), (v4i64 VR256:$src)), + (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (v4f64 immAllZerosV))), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (bc_v4i64 (v8i32 immAllZerosV)))), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + //===----------------------------------------------------------------------===// // Variable Bit Shifts diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 9f9fb350bdc..bed78ac8ab9 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -111,6 +111,8 @@ public: Type *Ty) const override; unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty) const override; + bool isLegalPredicatedLoad (Type *DataType, int Consecutive) const override; + bool isLegalPredicatedStore(Type *DataType, int Consecutive) const override; /// @} }; @@ -1156,3 +1158,19 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, } return X86TTI::getIntImmCost(Imm, Ty); } + +bool X86TTI::isLegalPredicatedLoad(Type *DataType, int Consecutive) const { + int ScalarWidth = DataType->getScalarSizeInBits(); + + // Todo: AVX512 allows gather/scatter, works with strided and random as well + if ((ScalarWidth < 32) || (Consecutive == 0)) + return false; + if (ST->hasAVX512() || ST->hasAVX2()) + return true; + return false; +} + +bool X86TTI::isLegalPredicatedStore(Type *DataType, int Consecutive) const { + return isLegalPredicatedLoad(DataType, Consecutive); +} + diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll new file mode 100644 index 00000000000..8cb2d63d5f6 --- /dev/null +++ b/test/CodeGen/X86/masked_memop.ll @@ -0,0 +1,73 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2 + +; AVX512-LABEL: test1 +; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z} + +; AVX2-LABEL: test1 +; AVX2: vpmaskmovd 32(%rdi) +; AVX2: vpmaskmovd (%rdi) +; AVX2-NOT: blend + +define <16 x i32> @test1(<16 x i32> %trigger, i8* %addr) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + %res = call <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32>undef, i32 4, <16 x i1>%mask) + ret <16 x i32> %res +} + +; AVX512-LABEL: test2 +; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z} + +; AVX2-LABEL: test2 +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2-NOT: blend +define <16 x i32> @test2(<16 x i32> %trigger, i8* %addr) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + %res = call <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32>zeroinitializer, i32 4, <16 x i1>%mask) + ret <16 x i32> %res +} + +; AVX512-LABEL: test3 +; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1} + +define void 
@test3(<16 x i32> %trigger, i8* %addr, <16 x i32> %val) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v16i32(i8* %addr, <16 x i32>%val, i32 4, <16 x i1>%mask) + ret void +} + +; AVX512-LABEL: test4 +; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}} + +; AVX2-LABEL: test4 +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2: blend +define <16 x float> @test4(<16 x i32> %trigger, i8* %addr, <16 x float> %dst) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + %res = call <16 x float> @llvm.masked.load.v16f32(i8* %addr, <16 x float>%dst, i32 4, <16 x i1>%mask) + ret <16 x float> %res +} + +; AVX512-LABEL: test5 +; AVX512: vmovupd (%rdi), %zmm1 {%k1} + +; AVX2-LABEL: test5 +; AVX2: vpmaskmovq +; AVX2: vblendvpd +; AVX2: vpmaskmovq +; AVX2: vblendvpd +define <8 x double> @test5(<8 x i32> %trigger, i8* %addr, <8 x double> %dst) { + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x double> @llvm.masked.load.v8f64(i8* %addr, <8 x double>%dst, i32 4, <8 x i1>%mask) + ret <8 x double> %res +} + +declare <16 x i32> @llvm.masked.load.v16i32(i8*, <16 x i32>, i32, <16 x i1>) +declare void @llvm.masked.store.v16i32(i8*, <16 x i32>, i32, <16 x i1>) +declare <16 x float> @llvm.masked.load.v16f32(i8*, <16 x float>, i32, <16 x i1>) +declare void @llvm.masked.store.v16f32(i8*, <16 x float>, i32, <16 x i1>) +declare <8 x double> @llvm.masked.load.v8f64(i8*, <8 x double>, i32, <8 x i1>) +declare void @llvm.masked.store.v8f64(i8*, <8 x double>, i32, <8 x i1>) + diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp index 62938f7027b..49e13160a56 100644 --- a/utils/TableGen/CodeGenTarget.cpp +++ b/utils/TableGen/CodeGenTarget.cpp @@ -534,7 +534,8 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) { // variants with iAny types; otherwise, if the intrinsic is not // overloaded, all the types can be specified directly. assert(((!TyEl->isSubClassOf("LLVMExtendedType") && - !TyEl->isSubClassOf("LLVMTruncatedType")) || + !TyEl->isSubClassOf("LLVMTruncatedType") && + !TyEl->isSubClassOf("LLVMVectorSameWidth")) || VT == MVT::iAny || VT == MVT::vAny) && "Expected iAny or vAny type"); } else diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp index 37f6de057da..dcf4b80e4ef 100644 --- a/utils/TableGen/IntrinsicEmitter.cpp +++ b/utils/TableGen/IntrinsicEmitter.cpp @@ -257,7 +257,8 @@ enum IIT_Info { IIT_ANYPTR = 26, IIT_V1 = 27, IIT_VARARG = 28, - IIT_HALF_VEC_ARG = 29 + IIT_HALF_VEC_ARG = 29, + IIT_SAME_VEC_WIDTH_ARG = 30 }; @@ -305,6 +306,13 @@ static void EncodeFixedType(Record *R, std::vector &ArgCodes, Sig.push_back(IIT_TRUNC_ARG); else if (R->isSubClassOf("LLVMHalfElementsVectorType")) Sig.push_back(IIT_HALF_VEC_ARG); + else if (R->isSubClassOf("LLVMVectorSameWidth")) { + Sig.push_back(IIT_SAME_VEC_WIDTH_ARG); + Sig.push_back((Number << 2) | ArgCodes[Number]); + MVT::SimpleValueType VT = getValueType(R->getValueAsDef("ElTy")); + EncodeFixedValueType(VT, Sig); + return; + } else Sig.push_back(IIT_ARG); return Sig.push_back((Number << 2) | ArgCodes[Number]); -- 2.34.1
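[Editor's note, not part of the patch: a small sketch of how the IRBuilder
helpers added above (CreateMaskedLoad / CreateMaskedStore, which at this point
take a raw operand array) could be used to emit a masked copy. The function
emitMaskedCopy and its parameters are hypothetical.]

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Emits:  %v = @llvm.masked.load(ptr, passthru, align, mask)
//         @llvm.masked.store(ptr, %v, align, mask)
static Value *emitMaskedCopy(IRBuilder<> &Builder, Value *SrcPtr, Value *DstPtr,
                             Value *Mask, Value *PassThru, unsigned Align) {
  Value *AlignV = Builder.getInt32(Align);
  // Operand order matches the intrinsic signatures in the commit message:
  // load:  (i8* %addr, <N x T> %passthru, i32 align, <N x i1> %mask)
  Value *LoadOps[] = { SrcPtr, PassThru, AlignV, Mask };
  CallInst *Load = Builder.CreateMaskedLoad(LoadOps);
  // store: (i8* %addr, <N x T> %value, i32 align, <N x i1> %mask)
  Value *StoreOps[] = { DstPtr, Load, AlignV, Mask };
  Builder.CreateMaskedStore(StoreOps);
  return Load;
}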
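[Editor's note, also illustration only: the LLVMVectorSameWidth<0, llvm_i1_ty>
constraint in Intrinsics.td, and the matching SameVecWidthArgument check added
to the verifier, require the mask to be a vector of i1 with the same element
count as the overloaded data type. A hypothetical helper that computes that
mask type:]

#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// For a data type <N x T>, the mask type required by the masked intrinsics
// is <N x i1>.
static VectorType *getMaskTypeFor(VectorType *DataTy) {
  return VectorType::get(Type::getInt1Ty(DataTy->getContext()),
                         DataTy->getNumElements());
}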