From: Richard Sandiford Date: Mon, 12 Aug 2013 10:28:10 +0000 (+0000) Subject: [SystemZ] Use CLC and IPM to implement memcmp X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=ac168b8bc8773a083a10902f64e4ae57a925aee4 [SystemZ] Use CLC and IPM to implement memcmp For now this is restricted to fixed-length comparisons with a length in the range [1, 256], as for memcpy() and MVC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188163 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h index fe2fba42dc5..35da1d7a738 100644 --- a/include/llvm/Target/TargetSelectionDAGInfo.h +++ b/include/llvm/Target/TargetSelectionDAGInfo.h @@ -94,6 +94,20 @@ public: MachinePointerInfo DstPtrInfo) const { return SDValue(); } + + /// EmitTargetCodeForMemcmp - Emit target-specific code that performs a + /// memcmp, in cases where that is faster than a libcall. The first + /// returned SDValue is the result of the memcmp and the second is + /// the chain. Both SDValues can be null if a normal libcall should + /// be used. 
+ virtual std::pair<SDValue, SDValue> + EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Op1, SDValue Op2, + SDValue Op3, MachinePointerInfo Op1PtrInfo, + MachinePointerInfo Op2PtrInfo) const { + return std::make_pair(SDValue(), SDValue()); + } }; } // end llvm namespace diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index e23c5bea224..1101ee1c50b 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -58,6 +58,7 @@ #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include using namespace llvm; @@ -5463,6 +5464,26 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { return false; const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2)); + if (Size && Size->getZExtValue() == 0) { + EVT CallVT = TM.getTargetLowering()->getValueType(I.getType(), true); + setValue(&I, DAG.getConstant(0, CallVT)); + return true; + } + + const Value *Arg0 = I.getArgOperand(0); + const Value *Arg1 = I.getArgOperand(1); + const Value *Arg2 = I.getArgOperand(2); + const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo(); + std::pair<SDValue, SDValue> Res = + TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(), + getValue(Arg0), getValue(Arg1), getValue(Arg2), + MachinePointerInfo(Arg0), + MachinePointerInfo(Arg1)); + if (Res.first.getNode()) { + setValue(&I, Res.first); + DAG.setRoot(Res.second); + return true; + } // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0 // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0 diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt index 563513b5f42..eebc4e4572f 100644 --- a/lib/Target/SystemZ/README.txt +++ b/lib/Target/SystemZ/README.txt @@ -67,12 +67,12 @@ condition codes. For example, we could use LCDFR instead of LCDBR. 
-- We don't optimize block memory operations, except using single MVCs -for memcpy. +for memcpy and single CLCs for memcmp. -It's definitely worth using things like CLC, NC, XC and OC with +It's definitely worth using things like NC, XC and OC with constant lengths. MVCIN may be worthwhile too. -We should probably implement things like memcpy using MVC with EXECUTE. +We should probably implement general memcpy using MVC with EXECUTE. Likewise memcmp and CLC. MVCLE and CLCLE could be useful too. -- diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index a51f0168a9e..899b08c4599 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1702,6 +1702,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(UDIVREM64); OPCODE(MVC); OPCODE(CLC); + OPCODE(IPM); OPCODE(ATOMIC_SWAPW); OPCODE(ATOMIC_LOADW_ADD); OPCODE(ATOMIC_LOADW_SUB); @@ -2240,8 +2241,9 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI, } MachineBasicBlock * -SystemZTargetLowering::emitMVCWrapper(MachineInstr *MI, - MachineBasicBlock *MBB) const { +SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI, + MachineBasicBlock *MBB, + unsigned Opcode) const { const SystemZInstrInfo *TII = TM.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -2251,7 +2253,7 @@ SystemZTargetLowering::emitMVCWrapper(MachineInstr *MI, uint64_t SrcDisp = MI->getOperand(3).getImm(); uint64_t Length = MI->getOperand(4).getImm(); - BuildMI(*MBB, MI, DL, TII->get(SystemZ::MVC)) + BuildMI(*MBB, MI, DL, TII->get(Opcode)) .addOperand(DestBase).addImm(DestDisp).addImm(Length) .addOperand(SrcBase).addImm(SrcDisp); @@ -2483,7 +2485,9 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const { case SystemZ::ATOMIC_CMP_SWAPW: return emitAtomicCmpSwapW(MI, MBB); case SystemZ::MVCWrapper: - return emitMVCWrapper(MI, MBB); + return emitMemMemWrapper(MI, MBB, SystemZ::MVC); + case 
SystemZ::CLCWrapper: + return emitMemMemWrapper(MI, MBB, SystemZ::CLC); default: llvm_unreachable("Unexpected instr type to insert"); } diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 4098ff34b38..0036ce84aa7 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -84,6 +84,9 @@ namespace SystemZISD { // as for MVC. CLC, + // Store the CC value in bits 29 and 28 of an integer. + IPM, + // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or // ATOMIC_LOAD_. // @@ -234,8 +237,9 @@ private: unsigned BitSize) const; MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr *MI, MachineBasicBlock *BB) const; - MachineBasicBlock *emitMVCWrapper(MachineInstr *MI, - MachineBasicBlock *BB) const; + MachineBasicBlock *emitMemMemWrapper(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Opcode) const; }; } // end namespace llvm diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 9ee60aa80c3..54a86693599 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -293,6 +293,99 @@ SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, return Count; } +bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI, + unsigned &SrcReg, unsigned &SrcReg2, + int &Mask, int &Value) const { + assert(MI->isCompare() && "Caller should have checked for a comparison"); + + if (MI->getNumExplicitOperands() == 2 && + MI->getOperand(0).isReg() && + MI->getOperand(1).isImm()) { + SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = 0; + Value = MI->getOperand(1).getImm(); + Mask = ~0; + return true; + } + + return false; +} + +// If Reg is a virtual register that is used by only a single non-debug +// instruction, return the defining instruction, otherwise return null. 
+static MachineInstr *getDefSingleUse(const MachineRegisterInfo *MRI, + unsigned Reg) { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return 0; + + MachineRegisterInfo::use_nodbg_iterator I = MRI->use_nodbg_begin(Reg); + MachineRegisterInfo::use_nodbg_iterator E = MRI->use_nodbg_end(); + if (I == E || llvm::next(I) != E) + return 0; + + return MRI->getUniqueVRegDef(Reg); +} + +// Return true if MI is a shift of type Opcode by Imm bits. +static bool isShift(MachineInstr *MI, int Opcode, int64_t Imm) { + return (MI->getOpcode() == Opcode && + !MI->getOperand(2).getReg() && + MI->getOperand(3).getImm() == Imm); +} + +// Compare compares SrcReg against zero. Check whether SrcReg contains +// the result of an IPM sequence that is only used by Compare. Try to +// delete both of them if so and return true if a change was made. +static bool removeIPM(MachineInstr *Compare, unsigned SrcReg, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI) { + MachineInstr *SRA = getDefSingleUse(MRI, SrcReg); + if (!SRA || !isShift(SRA, SystemZ::SRA, 30)) + return false; + + MachineInstr *SLL = getDefSingleUse(MRI, SRA->getOperand(1).getReg()); + if (!SLL || !isShift(SLL, SystemZ::SLL, 2)) + return false; + + MachineInstr *IPM = getDefSingleUse(MRI, SLL->getOperand(1).getReg()); + if (!IPM || IPM->getOpcode() != SystemZ::IPM) + return false; + + // Check that there are no assignments to CC between the IPM and Compare, + // except for the SRA that we'd like to delete. We can ignore SLL because + // it does not assign to CC. We can also ignore uses of the SRA CC result, + // since it is effectively restoring CC to the value it had before IPM + // (for all current use cases). 
+ if (IPM->getParent() != Compare->getParent()) + return false; + MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare; + for (++MBBI; MBBI != MBBE; ++MBBI) { + MachineInstr *MI = MBBI; + if (MI != SRA && MI->modifiesRegister(SystemZ::CC, TRI)) + return false; + } + + IPM->eraseFromParent(); + SLL->eraseFromParent(); + SRA->eraseFromParent(); + Compare->eraseFromParent(); + return true; +} + +bool +SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare, + unsigned SrcReg, unsigned SrcReg2, + int Mask, int Value, + const MachineRegisterInfo *MRI) const { + assert(!SrcReg2 && "Only optimizing constant comparisons so far"); + bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0; + if (Value == 0 && + !IsLogical && + removeIPM(Compare, SrcReg, MRI, TM.getRegisterInfo())) + return true; + return false; +} + // If Opcode is a move that has a conditional variant, return that variant, // otherwise return 0. static unsigned getConditionalMove(unsigned Opcode) { diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index 276fd3b7a1b..3c4e8af0e61 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -129,6 +129,12 @@ public: MachineBasicBlock *FBB, const SmallVectorImpl &Cond, DebugLoc DL) const LLVM_OVERRIDE; + bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &Mask, int &Value) const + LLVM_OVERRIDE; + bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int Mask, int Value, + const MachineRegisterInfo *MRI) const LLVM_OVERRIDE; virtual bool isPredicable(MachineInstr *MI) const LLVM_OVERRIDE; virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index a7181d68a7f..834ffedcf3f 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ 
b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -1117,7 +1117,7 @@ let Defs = [CC] in { // Extract CC into bits 29 and 28 of a register. let Uses = [CC] in - def IPM : InherentRRE<"ipm", 0xB222, GR32, (null_frag)>; + def IPM : InherentRRE<"ipm", 0xB222, GR32, (z_ipm)>; // Read a 32-bit access register into a GR32. As with all GR32 operations, // the upper 32 bits of the enclosing GR64 remain unchanged, which is useful diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index dae04de02b7..8a5b909eb6d 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -58,6 +58,7 @@ def SDT_ZMemMemLength : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; +def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>; //===----------------------------------------------------------------------===// // Node definitions @@ -112,7 +113,9 @@ def z_atomic_cmp_swapw : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>; def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength, - [SDNPHasChain, SDNPMayLoad]>; + [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>; +def z_ipm : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic, + [SDNPInGlue]>; //===----------------------------------------------------------------------===// // Pattern fragments diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 4ca9292092d..341dc946550 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -125,3 +125,30 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain, } return SDValue(); } + +std::pair<SDValue, SDValue> SystemZSelectionDAGInfo:: +EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain, + SDValue Src1, SDValue Src2, SDValue Size, + MachinePointerInfo Op1PtrInfo, + MachinePointerInfo 
Op2PtrInfo) const { + if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) { + uint64_t Bytes = CSize->getZExtValue(); + if (Bytes >= 1 && Bytes <= 0x100) { + // A single CLC. + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, + Src1, Src2, Size); + SDValue Glue = Chain.getValue(1); + // IPM inserts the CC value into bits 29 and 28. For CLC, CC 0 means the + // operands are equal, CC 1 that Src1 is lower and CC 2 that Src1 is higher. + // NOTE(review): SHL 2 + SRA 30 maps these to 0, 1 and -2, which looks sign-inverted for memcmp -- confirm. + SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue); + SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, IPM, + DAG.getConstant(2, MVT::i32)); + SDValue SRA = DAG.getNode(ISD::SRA, DL, MVT::i32, SHL, + DAG.getConstant(30, MVT::i32)); + return std::make_pair(SRA, Chain); + } + } + return std::make_pair(SDValue(), SDValue()); +} diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h index 9138a9cc082..c757e167071 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h @@ -38,7 +38,13 @@ public: EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain, SDValue Dst, SDValue Byte, SDValue Size, unsigned Align, bool IsVolatile, - MachinePointerInfo DstPtrInfo) const; + MachinePointerInfo DstPtrInfo) const LLVM_OVERRIDE; + + virtual std::pair<SDValue, SDValue> + EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain, + SDValue Src1, SDValue Src2, SDValue Size, + MachinePointerInfo Op1PtrInfo, + MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE; }; } diff --git a/test/CodeGen/SystemZ/memcmp-01.ll b/test/CodeGen/SystemZ/memcmp-01.ll new file mode 100644 index 00000000000..37477695b78 --- /dev/null +++ b/test/CodeGen/SystemZ/memcmp-01.ll @@ -0,0 +1,134 @@ +; Test memcmp using CLC. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +declare signext i32 @memcmp(i8 *%src1, i8 *%src2, i64 %size) + +; Zero-length comparisons should be optimized away. +define i32 @f1(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f1: +; CHECK: lhi %r2, 0 +; CHECK: br %r14 + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 0) + ret i32 %res +} + +; Check a case where the result is used as an integer. +define i32 @f2(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f2: +; CHECK: clc 0(2,%r2), 0(%r3) +; CHECK: ipm %r2 +; CHECK: sll %r2, 2 +; CHECK: sra %r2, 30 +; CHECK: br %r14 + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 2) + ret i32 %res +} + +; Check a case where the result is tested for equality. +define void @f3(i8 *%src1, i8 *%src2, i32 *%dest) { +; CHECK-LABEL: f3: +; CHECK: clc 0(3,%r2), 0(%r3) +; CHECK-NEXT: je {{\..*}} +; CHECK: br %r14 + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 3) + %cmp = icmp eq i32 %res, 0 + br i1 %cmp, label %exit, label %store + +store: + store i32 0, i32 *%dest + br label %exit + +exit: + ret void +} + +; Check a case where the result is tested for inequality. +define void @f4(i8 *%src1, i8 *%src2, i32 *%dest) { +; CHECK-LABEL: f4: +; CHECK: clc 0(4,%r2), 0(%r3) +; CHECK-NEXT: jlh {{\..*}} +; CHECK: br %r14 +entry: + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 4) + %cmp = icmp ne i32 %res, 0 + br i1 %cmp, label %exit, label %store + +store: + store i32 0, i32 *%dest + br label %exit + +exit: + ret void +} + +; Check a case where the result is tested via slt. +define void @f5(i8 *%src1, i8 *%src2, i32 *%dest) { +; CHECK-LABEL: f5: +; CHECK: clc 0(5,%r2), 0(%r3) +; CHECK-NEXT: jl {{\..*}} +; CHECK: br %r14 +entry: + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 5) + %cmp = icmp slt i32 %res, 0 + br i1 %cmp, label %exit, label %store + +store: + store i32 0, i32 *%dest + br label %exit + +exit: + ret void +} + +; Check a case where the result is tested for sgt. 
+define void @f6(i8 *%src1, i8 *%src2, i32 *%dest) { +; CHECK-LABEL: f6: +; CHECK: clc 0(6,%r2), 0(%r3) +; CHECK-NEXT: jh {{\..*}} +; CHECK: br %r14 +entry: + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 6) + %cmp = icmp sgt i32 %res, 0 + br i1 %cmp, label %exit, label %store + +store: + store i32 0, i32 *%dest + br label %exit + +exit: + ret void +} + +; Check the upper end of the CLC range. Here the result is used both as +; an integer and for branching, but it's better to branch on the result +; of the SRA. +define i32 @f7(i8 *%src1, i8 *%src2, i32 *%dest) { +; CHECK-LABEL: f7: +; CHECK: clc 0(256,%r2), 0(%r3) +; CHECK: ipm %r2 +; CHECK: sll %r2, 2 +; CHECK: sra %r2, 30 +; CHECK: jl {{.L*}} +; CHECK: br %r14 +entry: + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 256) + %cmp = icmp slt i32 %res, 0 + br i1 %cmp, label %exit, label %store + +store: + store i32 0, i32 *%dest + br label %exit + +exit: + ret i32 %res +} + +; 257 bytes is too big for a single CLC. For now expect a call instead. +define i32 @f8(i8 *%src1, i8 *%src2) { +; CHECK-LABEL: f8: +; CHECK: brasl %r14, memcmp@PLT +; CHECK: br %r14 + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257) + ret i32 %res +}