[SystemZ] Extend memcpy and memset support to all constant lengths

author Richard Sandiford <rsandifo@linux.vnet.ibm.com>

Tue, 27 Aug 2013 09:54:29 +0000 (09:54 +0000)

committer Richard Sandiford <rsandifo@linux.vnet.ibm.com>

Tue, 27 Aug 2013 09:54:29 +0000 (09:54 +0000)
author Richard Sandiford <rsandifo@linux.vnet.ibm.com>
Tue, 27 Aug 2013 09:54:29 +0000 (09:54 +0000)
committer Richard Sandiford <rsandifo@linux.vnet.ibm.com>
Tue, 27 Aug 2013 09:54:29 +0000 (09:54 +0000)
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp

index 20afab782a90ce8fc11381fa1ddfe633890c2d4b..216ca3450c57160af60880882633d41b04e05d2d 100644 (file)
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1917,7 +1917,9 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
      OPCODE(UDIVREM32);
      OPCODE(UDIVREM64);
      OPCODE(MVC);
+    OPCODE(MVC_LOOP);
      OPCODE(CLC);
+    OPCODE(CLC_LOOP);
      OPCODE(STRCMP);
      OPCODE(STPCPY);
      OPCODE(SEARCH_STRING);
@@ -1952,18 +1954,31 @@ static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
    return NewMBB;
  }
  
-// Split MBB after MI and return the new block (the one that contains
-// instructions after MI).
-static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
-                                          MachineBasicBlock *MBB) {
+// Split MBB before MI and return the new block (the one that contains MI).
+static MachineBasicBlock *splitBlockBefore(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) {
    MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
-  NewMBB->splice(NewMBB->begin(), MBB,
-                 llvm::next(MachineBasicBlock::iterator(MI)),
-                 MBB->end());
+  NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
    NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
    return NewMBB;
  }
  
+// Force base value Base into a register before MI.  Return the register.
+static unsigned forceReg(MachineInstr *MI, MachineOperand &Base,
+                         const SystemZInstrInfo *TII) {
+  if (Base.isReg())
+    return Base.getReg();
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LA), Reg)
+    .addOperand(Base).addImm(0).addReg(0);
+  return Reg;
+}
+
  // Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
  MachineBasicBlock *
  SystemZTargetLowering::emitSelect(MachineInstr *MI,
@@ -1978,7 +1993,7 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
    DebugLoc DL       = MI->getDebugLoc();
  
    MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *JoinMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *JoinMBB  = splitBlockBefore(MI, MBB);
    MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
  
    //  StartMBB:
@@ -1999,7 +2014,7 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
    //   %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
    //  ...
    MBB = JoinMBB;
-  BuildMI(*MBB, MBB->begin(), DL, TII->get(SystemZ::PHI), DestReg)
+  BuildMI(*MBB, MI, DL, TII->get(SystemZ::PHI), DestReg)
      .addReg(TrueReg).addMBB(StartMBB)
      .addReg(FalseReg).addMBB(FalseMBB);
  
@@ -2046,7 +2061,7 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
      CCMask ^= CCValid;
  
    MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *JoinMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *JoinMBB  = splitBlockBefore(MI, MBB);
    MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
  
    //  StartMBB:
@@ -2122,7 +2137,7 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
  
    // Insert a basic block for the main loop.
    MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *DoneMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *DoneMBB  = splitBlockBefore(MI, MBB);
    MachineBasicBlock *LoopMBB  = emitBlockAfter(StartMBB);
  
    //  StartMBB:
@@ -2244,7 +2259,7 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
  
    // Insert 3 basic blocks for the loop.
    MachineBasicBlock *StartMBB  = MBB;
-  MachineBasicBlock *DoneMBB   = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *DoneMBB   = splitBlockBefore(MI, MBB);
    MachineBasicBlock *LoopMBB   = emitBlockAfter(StartMBB);
    MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
    MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
@@ -2351,7 +2366,7 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
  
    // Insert 2 basic blocks for the loop.
    MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *DoneMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *DoneMBB  = splitBlockBefore(MI, MBB);
    MachineBasicBlock *LoopMBB  = emitBlockAfter(StartMBB);
    MachineBasicBlock *SetMBB   = emitBlockAfter(LoopMBB);
  
@@ -2465,17 +2480,126 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
                                           MachineBasicBlock *MBB,
                                           unsigned Opcode) const {
    const SystemZInstrInfo *TII = TM.getInstrInfo();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
    DebugLoc DL = MI->getDebugLoc();
  
-  MachineOperand DestBase = MI->getOperand(0);
+  MachineOperand DestBase = earlyUseOperand(MI->getOperand(0));
    uint64_t       DestDisp = MI->getOperand(1).getImm();
-  MachineOperand SrcBase  = MI->getOperand(2);
+  MachineOperand SrcBase  = earlyUseOperand(MI->getOperand(2));
    uint64_t       SrcDisp  = MI->getOperand(3).getImm();
    uint64_t       Length   = MI->getOperand(4).getImm();
  
-  BuildMI(*MBB, MI, DL, TII->get(Opcode))
-    .addOperand(DestBase).addImm(DestDisp).addImm(Length)
-    .addOperand(SrcBase).addImm(SrcDisp);
+  // Check for the loop form, in which operand 5 is the trip count.
+  if (MI->getNumExplicitOperands() > 5) {
+    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+
+    uint64_t StartCountReg = MI->getOperand(5).getReg();
+    uint64_t StartSrcReg   = forceReg(MI, SrcBase, TII);
+    uint64_t StartDestReg  = (HaveSingleBase ? StartSrcReg :
+                              forceReg(MI, DestBase, TII));
+
+    const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
+    uint64_t ThisSrcReg  = MRI.createVirtualRegister(RC);
+    uint64_t ThisDestReg = (HaveSingleBase ? ThisSrcReg :
+                            MRI.createVirtualRegister(RC));
+    uint64_t NextSrcReg  = MRI.createVirtualRegister(RC);
+    uint64_t NextDestReg = (HaveSingleBase ? NextSrcReg :
+                            MRI.createVirtualRegister(RC));
+
+    RC = &SystemZ::GR64BitRegClass;
+    uint64_t ThisCountReg = MRI.createVirtualRegister(RC);
+    uint64_t NextCountReg = MRI.createVirtualRegister(RC);
+
+    MachineBasicBlock *StartMBB = MBB;
+    MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+    MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+
+    //  StartMBB:
+    //   # fall through to LoopMMB
+    MBB->addSuccessor(LoopMBB);
+
+    //  LoopMBB:
+    //   %ThisDestReg = phi [ %StartDestReg, StartMBB ],
+    //                      [ %NextDestReg, LoopMBB ]
+    //   %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
+    //                     [ %NextSrcReg, LoopMBB ]
+    //   %ThisCountReg = phi [ %StartCountReg, StartMBB ],
+    //                       [ %NextCountReg, LoopMBB ]
+    //   PFD 2, 768+DestDisp(%ThisDestReg)
+    //   Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
+    //   %NextDestReg = LA 256(%ThisDestReg)
+    //   %NextSrcReg = LA 256(%ThisSrcReg)
+    //   %NextCountReg = AGHI %ThisCountReg, -1
+    //   CGHI %NextCountReg, 0
+    //   JLH LoopMBB
+    //   # fall through to DoneMMB
+    //
+    // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
+    MBB = LoopMBB;
+
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
+      .addReg(StartDestReg).addMBB(StartMBB)
+      .addReg(NextDestReg).addMBB(LoopMBB);
+    if (!HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
+        .addReg(StartSrcReg).addMBB(StartMBB)
+        .addReg(NextSrcReg).addMBB(LoopMBB);
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
+      .addReg(StartCountReg).addMBB(StartMBB)
+      .addReg(NextCountReg).addMBB(LoopMBB);
+    BuildMI(MBB, DL, TII->get(SystemZ::PFD))
+      .addImm(SystemZ::PFD_WRITE)
+      .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
+    BuildMI(MBB, DL, TII->get(Opcode))
+      .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
+      .addReg(ThisSrcReg).addImm(SrcDisp);
+    BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
+      .addReg(ThisDestReg).addImm(256).addReg(0);
+    if (!HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
+        .addReg(ThisSrcReg).addImm(256).addReg(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
+      .addReg(ThisCountReg).addImm(-1);
+    BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+      .addReg(NextCountReg).addImm(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+      .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+      .addMBB(LoopMBB);
+    MBB->addSuccessor(LoopMBB);
+    MBB->addSuccessor(DoneMBB);
+
+    DestBase = MachineOperand::CreateReg(NextDestReg, false);
+    SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+    Length &= 255;
+    MBB = DoneMBB;
+  }
+  // Handle any remaining bytes with straight-line code.
+  while (Length > 0) {
+    uint64_t ThisLength = std::min(Length, uint64_t(256));
+    // The previous iteration might have created out-of-range displacements.
+    // Apply them using LAY if so.
+    if (!isUInt<12>(DestDisp)) {
+      unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+        .addOperand(DestBase).addImm(DestDisp).addReg(0);
+      DestBase = MachineOperand::CreateReg(Reg, false);
+      DestDisp = 0;
+    }
+    if (!isUInt<12>(SrcDisp)) {
+      unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+        .addOperand(SrcBase).addImm(SrcDisp).addReg(0);
+      SrcBase = MachineOperand::CreateReg(Reg, false);
+      SrcDisp = 0;
+    }
+    BuildMI(*MBB, MI, DL, TII->get(Opcode))
+      .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength)
+      .addOperand(SrcBase).addImm(SrcDisp);
+    DestDisp += ThisLength;
+    SrcDisp += ThisLength;
+    Length -= ThisLength;
+  }
  
    MI->eraseFromParent();
    return MBB;
@@ -2503,7 +2627,7 @@ SystemZTargetLowering::emitStringWrapper(MachineInstr *MI,
    uint64_t End2Reg  = MRI.createVirtualRegister(RC);
  
    MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
    MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
  
    //  StartMBB:
@@ -2765,9 +2889,11 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
  
    case SystemZ::ATOMIC_CMP_SWAPW:
      return emitAtomicCmpSwapW(MI, MBB);
-  case SystemZ::MVCWrapper:
+  case SystemZ::MVCSequence:
+  case SystemZ::MVCLoop:
      return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
-  case SystemZ::CLCWrapper:
+  case SystemZ::CLCSequence:
+  case SystemZ::CLCLoop:
      return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
    case SystemZ::CLSTLoop:
      return emitStringWrapper(MI, MBB, SystemZ::CLST);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h

index f6a2ce041ec9dcf276034e17e64651cba4fd47d4..9831777873a151118481ce190522dcd596e3a971 100644 (file)
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -74,16 +74,25 @@ namespace SystemZISD {
      UDIVREM32,
      UDIVREM64,
  
-    // Use MVC to copy bytes from one memory location to another.
-    // The first operand is the target address, the second operand is the
-    // source address, and the third operand is the constant length.
+    // Use a series of MVCs to copy bytes from one memory location to another.
+    // The operands are:
+    // - the target address
+    // - the source address
+    // - the constant length
+    //
      // This isn't a memory opcode because we'd need to attach two
      // MachineMemOperands rather than one.
      MVC,
  
+    // Like MVC, but implemented as a loop that handles X*256 bytes
+    // followed by straight-line code to handle the rest (if any).
+    // The value of X is passed as an additional operand.
+    MVC_LOOP,
+
      // Use CLC to compare two blocks of memory, with the same comments
-    // as for MVC.
+    // as for MVC and MVC_LOOP.
      CLC,
+    CLC_LOOP,
  
      // Use an MVST-based sequence to implement stpcpy().
      STPCPY,
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td

index dbe0fb5cac3523ca4c9bc707687b370febe96025..8d0dcb673fcf20473d207c649c1300db2a0f1e2d 100644 (file)
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -86,9 +86,9 @@ def : CopySign128<FP64,  (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_high),
  def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_high),
                                    (EXTRACT_SUBREG FP128:$src2, subreg_high))>;
  
-defm LoadStoreF32  : MVCLoadStore<load, store, f32,  MVCWrapper, 4>;
-defm LoadStoreF64  : MVCLoadStore<load, store, f64,  MVCWrapper, 8>;
-defm LoadStoreF128 : MVCLoadStore<load, store, f128, MVCWrapper, 16>;
+defm LoadStoreF32  : MVCLoadStore<load, store, f32,  MVCSequence, 4>;
+defm LoadStoreF64  : MVCLoadStore<load, store, f64,  MVCSequence, 8>;
+defm LoadStoreF128 : MVCLoadStore<load, store, f128, MVCSequence, 16>;
  
  //===----------------------------------------------------------------------===//
  // Load instructions
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td

index a7e18ec681207ee44221fd544e94f17d21bd0d63..7f2f9f8805d64e20fa27ee948b8fcb6b23df01b9 100644 (file)
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -1426,23 +1426,26 @@ class AtomicLoadWBinaryReg<SDPatternOperator operator>
  class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
    : AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
  
-// Define an instruction that operates on two fixed-length blocks of memory.
-// The real instruction uses a bdladdr12onlylen8 for the first operand and a
-// bdaddr12only for the second, with the length of the second operand being
-// implicitly the same as the first.  This arrangement matches the underlying
-// assembly syntax.  However, for instruction selection it's easier to have
-// two normal bdaddr12onlys and a separate length operand, so define a pseudo
-// instruction for that too.
+// Define an instruction that operates on two fixed-length blocks of memory,
+// and associated pseudo instructions for operating on blocks of any size.
+// The Sequence form uses a straight-line sequence of instructions and
+// the Loop form uses a loop of length-256 instructions followed by
+// another instruction to handle the excess.
  multiclass MemorySS<string mnemonic, bits<8> opcode,
-                    SDPatternOperator operator> {
+                    SDPatternOperator sequence, SDPatternOperator loop> {
    def "" : InstSS<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
                                         bdaddr12only:$BD2),
                    mnemonic##"\t$BDL1, $BD2", []>;
-  let usesCustomInserter = 1 in
-    def Wrapper : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                      imm32len8:$length),
-                         [(operator bdaddr12only:$dest, bdaddr12only:$src,
-                                    imm32len8:$length)]>;
+  let usesCustomInserter = 1 in {
+    def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                       imm64:$length),
+                           [(sequence bdaddr12only:$dest, bdaddr12only:$src,
+                                      imm64:$length)]>;
+    def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                   imm64:$length, GR64:$count256),
+                      [(loop bdaddr12only:$dest, bdaddr12only:$src,
+                             imm64:$length, GR64:$count256)]>;
+  }
  }
  
  // Define an instruction that operates on two strings, both terminated
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td

index 8e1f5ac3c976232a7cd2dc90d51d0a47081f8f98..399b48a3368a6d393fd69f08cb7135b6d06e2bc4 100644 (file)
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -344,25 +344,25 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store,         imm64sx16>;
  
  // Memory-to-memory moves.
  let mayLoad = 1, mayStore = 1 in
-  defm MVC : MemorySS<"mvc", 0xD2, z_mvc>;
+  defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
  
  // String moves.
  let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0W] in
    defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
  
  defm LoadStore8_32  : MVCLoadStore<anyextloadi8, truncstorei8, i32,
-                                   MVCWrapper, 1>;
+                                   MVCSequence, 1>;
  defm LoadStore16_32 : MVCLoadStore<anyextloadi16, truncstorei16, i32,
-                                   MVCWrapper, 2>;
-defm LoadStore32_32 : MVCLoadStore<load, store, i32, MVCWrapper, 4>;
+                                   MVCSequence, 2>;
+defm LoadStore32_32 : MVCLoadStore<load, store, i32, MVCSequence, 4>;
  
  defm LoadStore8  : MVCLoadStore<anyextloadi8, truncstorei8, i64,
-                                MVCWrapper, 1>;
+                                MVCSequence, 1>;
  defm LoadStore16 : MVCLoadStore<anyextloadi16, truncstorei16, i64,
-                                MVCWrapper, 2>;
+                                MVCSequence, 2>;
  defm LoadStore32 : MVCLoadStore<anyextloadi32, truncstorei32, i64,
-                                MVCWrapper, 4>;
-defm LoadStore64 : MVCLoadStore<load, store, i64, MVCWrapper, 8>;
+                                MVCSequence, 4>;
+defm LoadStore64 : MVCLoadStore<load, store, i64, MVCSequence, 8>;
  
  //===----------------------------------------------------------------------===//
  // Sign extensions
@@ -1028,7 +1028,7 @@ defm : ZXB<z_ucmp, GR64, CLGFR>;
  
  // Memory-to-memory comparison.
  let mayLoad = 1, Defs = [CC] in
-  defm CLC : MemorySS<"clc", 0xD5, z_clc>;
+  defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
  
  // String comparison.
  let mayLoad = 1, Defs = [CC], Uses = [R0W] in
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td

index eb96dba0f2d9053b559c8fa3283cf69dd5ce892e..421e41f11e6327c49a08ab351f166c56ee92abba 100644 (file)
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -219,11 +219,6 @@ def uimm8    : Immediate<i8, [{}], UIMM8, "U8Imm">;
  // i32 immediates
  //===----------------------------------------------------------------------===//
  
-// Immediates for 8-bit lengths.
-def imm32len8 : Immediate<i32, [{
-  return isUInt<8>(N->getZExtValue() - 1);
-}], NOOP_SDNodeXForm, "U32Imm">;
-
  // Immediates for the lower and upper 16 bits of an i32, with the other
  // bits of the i32 being zero.
  def imm32ll16 : Immediate<i32, [{
@@ -358,7 +353,7 @@ def imm64zx32n : Immediate<i64, [{
    return isUInt<32>(-N->getSExtValue());
  }], NEGIMM32, "U32Imm">;
  
-def imm64 : ImmLeaf<i64, [{}]>;
+def imm64 : ImmLeaf<i64, [{}]>, Operand<i64>;
  
  //===----------------------------------------------------------------------===//
  // Floating-point immediates
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td

index e2c43d6e582ad9b4af32e75256c2c570b2ec5885..ff64ea8fa0b2d6c021af07bdf9f73a4a5d3286fb 100644 (file)
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -57,7 +57,12 @@ def SDT_ZAtomicCmpSwapW     : SDTypeProfile<1, 6,
  def SDT_ZMemMemLength       : SDTypeProfile<0, 3,
                                              [SDTCisPtrTy<0>,
                                               SDTCisPtrTy<1>,
-                                             SDTCisVT<2, i32>]>;
+                                             SDTCisVT<2, i64>]>;
+def SDT_ZMemMemLoop         : SDTypeProfile<0, 4,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i64>,
+                                             SDTCisVT<3, i64>]>;
  def SDT_ZString             : SDTypeProfile<1, 3,
                                              [SDTCisPtrTy<0>,
                                               SDTCisPtrTy<1>,
@@ -123,8 +128,12 @@ def z_atomic_cmp_swapw  : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>;
  
  def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
                                   [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_mvc_loop          : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
  def z_clc               : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
                                   [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_clc_loop          : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
  def z_strcmp            : SDNode<"SystemZISD::STRCMP", SDT_ZString,
                                   [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
  def z_stpcpy            : SDNode<"SystemZISD::STPCPY", SDT_ZString,
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp

index 638c3ee6f6ff2c34ed4764584ce0a5a912a3d85b..6026b1f6f0e653c39af0caa0be36eff9b6044832 100644 (file)
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -25,6 +25,30 @@ SystemZSelectionDAGInfo(const SystemZTargetMachine &TM)
  SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() {
  }
  
+// Use MVC to copy Size bytes from Src to Dest, deciding whether to use
+// a loop or straight-line code.
+static SDValue emitMVC(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                       SDValue Dst, SDValue Src, uint64_t Size) {
+  EVT PtrVT = Src.getValueType();
+  // The heuristic we use is to prefer loops for anything that would
+  // require 7 or more MVCs.  With these kinds of sizes there isn't
+  // much to choose between straight-line code and looping code,
+  // since the time will be dominated by the MVCs themselves.
+  // However, the loop has 4 or 5 instructions (depending on whether
+  // the base addresses can be proved equal), so there doesn't seem
+  // much point using a loop for 5 * 256 bytes or fewer.  Anything in
+  // the range (5 * 256, 6 * 256) will need another instruction after
+  // the loop, so it doesn't seem worth using a loop then either.
+  // The next value up, 6 * 256, can be implemented in the same
+  // number of straight-line MVCs as 6 * 256 - 1.
+  if (Size > 6 * 256)
+    return DAG.getNode(SystemZISD::MVC_LOOP, DL, MVT::Other, Chain, Dst, Src,
+                       DAG.getConstant(Size, PtrVT),
+                       DAG.getConstant(Size / 256, PtrVT));
+  return DAG.getNode(SystemZISD::MVC, DL, MVT::Other, Chain, Dst, Src,
+                     DAG.getConstant(Size, PtrVT));
+}
+
  SDValue SystemZSelectionDAGInfo::
  EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                          SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
@@ -34,14 +58,8 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
    if (IsVolatile)
      return SDValue();
  
-  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
-    uint64_t Bytes = CSize->getZExtValue();
-    if (Bytes >= 1 && Bytes <= 0x100) {
-      // A single MVC.
-      return DAG.getNode(SystemZISD::MVC, DL, MVT::Other,
-                         Chain, Dst, Src, Size);
-    }
-  }
+  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size))
+    return emitMVC(DAG, DL, Chain, Dst, Src, CSize->getZExtValue());
    return SDValue();
  }
  
@@ -65,7 +83,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                          SDValue Dst, SDValue Byte, SDValue Size,
                          unsigned Align, bool IsVolatile,
                          MachinePointerInfo DstPtrInfo) const {
-  EVT DstVT = Dst.getValueType();
+  EVT PtrVT = Dst.getValueType();
  
    if (IsVolatile)
      return SDValue();
@@ -89,8 +107,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                                       Align, DstPtrInfo);
          if (Size2 == 0)
            return Chain1;
-        Dst = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
-                          DAG.getConstant(Size1, DstVT));
+        Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                          DAG.getConstant(Size1, PtrVT));
          DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
          SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
                                       std::min(Align, Size1), DstPtrInfo);
@@ -103,8 +121,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                                        false, false, Align);
          if (Bytes == 1)
            return Chain1;
-        SDValue Dst2 = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
-                                   DAG.getConstant(1, DstVT));
+        SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                                   DAG.getConstant(1, PtrVT));
          SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2,
                                        DstPtrInfo.getWithOffset(1),
                                        false, false, 1);
@@ -112,16 +130,13 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
        }
      }
      assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
-    if (Bytes <= 0x101) {
-      // Copy the byte to the first location and then use MVC to copy
-      // it to the rest.
-      Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
-                           false, false, Align);
-      SDValue Dst2 = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
-                                 DAG.getConstant(1, DstVT));
-      return DAG.getNode(SystemZISD::MVC, DL, MVT::Other, Chain, Dst2, Dst,
-                         DAG.getConstant(Bytes - 1, MVT::i32));
-    }
+    // Copy the byte to the first location and then use MVC to copy
+    // it to the rest.
+    Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
+                         false, false, Align);
+    SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                                   DAG.getConstant(1, PtrVT));
+    return emitMVC(DAG, DL, Chain, DstPlus1, Dst, Bytes - 1);
    }
    return SDValue();
  }
@@ -144,13 +159,14 @@ EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                          SDValue Src1, SDValue Src2, SDValue Size,
                          MachinePointerInfo Op1PtrInfo,
                          MachinePointerInfo Op2PtrInfo) const {
+  EVT PtrVT = Src1.getValueType();
    if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
      uint64_t Bytes = CSize->getZExtValue();
      if (Bytes >= 1 && Bytes <= 0x100) {
        // A single CLC.
        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
        Chain = DAG.getNode(SystemZISD::CLC, DL, VTs, Chain,
-                          Src1, Src2, Size);
+                          Src1, Src2, Size, DAG.getConstant(0, PtrVT));
        SDValue Glue = Chain.getValue(1);
        return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
      }
diff --git a/test/CodeGen/SystemZ/memcpy-01.ll b/test/CodeGen/SystemZ/memcpy-01.ll

index 7cb58b31cce65f8d1793734afaed8fe4f6e83a84..b53ec5452e25cf721e509251c3e149fc529fd23c 100644 (file)
--- a/test/CodeGen/SystemZ/memcpy-01.ll
+++ b/test/CodeGen/SystemZ/memcpy-01.ll
@@ -4,7 +4,9 @@
  
  declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *nocapture, i8 *nocapture, i32, i32, i1) nounwind
  declare void @llvm.memcpy.p0i8.p0i8.i64(i8 *nocapture, i8 *nocapture, i64, i32, i1) nounwind
+declare void @foo(i8 *, i8 *)
  
+; Test a no-op move, i32 version.
  define void @f1(i8 *%dest, i8 *%src) {
  ; CHECK-LABEL: f1:
  ; CHECK-NOT: %r2
@@ -15,6 +17,7 @@ define void @f1(i8 *%dest, i8 *%src) {
    ret void
  }
  
+; Test a no-op move, i64 version.
  define void @f2(i8 *%dest, i8 *%src) {
  ; CHECK-LABEL: f2:
  ; CHECK-NOT: %r2
@@ -25,6 +28,7 @@ define void @f2(i8 *%dest, i8 *%src) {
    ret void
  }
  
+; Test a 1-byte move, i32 version.
  define void @f3(i8 *%dest, i8 *%src) {
  ; CHECK-LABEL: f3:
  ; CHECK: mvc 0(1,%r2), 0(%r3)
@@ -34,6 +38,7 @@ define void @f3(i8 *%dest, i8 *%src) {
    ret void
  }
  
+; Test a 1-byte move, i64 version.
  define void @f4(i8 *%dest, i8 *%src) {
  ; CHECK-LABEL: f4:
  ; CHECK: mvc 0(1,%r2), 0(%r3)
@@ -43,6 +48,7 @@ define void @f4(i8 *%dest, i8 *%src) {
    ret void
  }
  
+; Test the upper range of a single MVC, i32 version.
  define void @f5(i8 *%dest, i8 *%src) {
  ; CHECK-LABEL: f5:
  ; CHECK: mvc 0(256,%r2), 0(%r3)
@@ -52,6 +58,7 @@ define void @f5(i8 *%dest, i8 *%src) {
    ret void
  }
  
+; Test the upper range of a single MVC, i64 version.
  define void @f6(i8 *%dest, i8 *%src) {
  ; CHECK-LABEL: f6:
  ; CHECK: mvc 0(256,%r2), 0(%r3)
@@ -61,22 +68,168 @@ define void @f6(i8 *%dest, i8 *%src) {
    ret void
  }
  
-; 257 bytes is too big for a single MVC.  For now expect none, so that
-; the test fails and gets updated when large copies are implemented.
+; Test the first case that needs two MVCs.
  define void @f7(i8 *%dest, i8 *%src) {
  ; CHECK-LABEL: f7:
-; CHECK-NOT: mvc
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(1,%r2), 256(%r3)
  ; CHECK: br %r14
    call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%dest, i8 *%src, i32 257, i32 1,
                                         i1 false)
    ret void
  }
  
+; Test the last-but-one case that needs two MVCs.
  define void @f8(i8 *%dest, i8 *%src) {
  ; CHECK-LABEL: f8:
-; CHECK-NOT: mvc
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(255,%r2), 256(%r3)
+; CHECK: br %r14
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 511, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; Test the last case that needs two MVCs.
+define void @f9(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f9:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: br %r14
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 512, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; Test an arbitrary value that uses straight-line code.
+define void @f10(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f10:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: mvc 512(256,%r2), 512(%r3)
+; CHECK: mvc 768(256,%r2), 768(%r3)
+; CHECK: mvc 1024(255,%r2), 1024(%r3)
+; CHECK: br %r14
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; ...and again in cases where not all parts are in range of MVC.
+define void @f11(i8 *%srcbase, i8 *%destbase) {
+; CHECK-LABEL: f11:
+; CHECK: mvc 4000(256,%r2), 3500(%r3)
+; CHECK: lay [[NEWDEST:%r[1-5]]], 4256(%r2)
+; CHECK: mvc 0(256,[[NEWDEST]]), 3756(%r3)
+; CHECK: mvc 256(256,[[NEWDEST]]), 4012(%r3)
+; CHECK: lay [[NEWSRC:%r[1-5]]], 4268(%r3)
+; CHECK: mvc 512(256,[[NEWDEST]]), 0([[NEWSRC]])
+; CHECK: mvc 768(255,[[NEWDEST]]), 256([[NEWSRC]])
+; CHECK: br %r14
+  %dest = getelementptr i8 *%srcbase, i64 4000
+  %src = getelementptr i8* %destbase, i64 3500
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; ...and again with a destination frame base that goes out of range.
+define void @f12() {
+; CHECK-LABEL: f12:
+; CHECK: brasl %r14, foo@PLT
+; CHECK: mvc 4076(256,%r15), 2100(%r15)
+; CHECK: lay [[NEWDEST:%r[1-5]]], 4332(%r15)
+; CHECK: mvc 0(256,[[NEWDEST]]), 2356(%r15)
+; CHECK: mvc 256(256,[[NEWDEST]]), 2612(%r15)
+; CHECK: mvc 512(256,[[NEWDEST]]), 2868(%r15)
+; CHECK: mvc 768(255,[[NEWDEST]]), 3124(%r15)
+; CHECK: brasl %r14, foo@PLT
+; CHECK: br %r14
+  %arr = alloca [6000 x i8]
+  %dest = getelementptr [6000 x i8] *%arr, i64 0, i64 3900
+  %src = getelementptr [6000 x i8] *%arr, i64 0, i64 1924
+  call void @foo(i8 *%dest, i8 *%src)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+                                       i1 false)
+  call void @foo(i8 *%dest, i8 *%src)
+  ret void
+}
+
+; ...and again with a source frame base that goes out of range.
+define void @f13() {
+; CHECK-LABEL: f13:
+; CHECK: brasl %r14, foo@PLT
+; CHECK: mvc 200(256,%r15), 3826(%r15)
+; CHECK: mvc 456(256,%r15), 4082(%r15)
+; CHECK: lay [[NEWSRC:%r[1-5]]], 4338(%r15)
+; CHECK: mvc 712(256,%r15), 0([[NEWSRC]])
+; CHECK: mvc 968(256,%r15), 256([[NEWSRC]])
+; CHECK: mvc 1224(255,%r15), 512([[NEWSRC]])
+; CHECK: brasl %r14, foo@PLT
+; CHECK: br %r14
+  %arr = alloca [6000 x i8]
+  %dest = getelementptr [6000 x i8] *%arr, i64 0, i64 24
+  %src = getelementptr [6000 x i8] *%arr, i64 0, i64 3650
+  call void @foo(i8 *%dest, i8 *%src)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+                                       i1 false)
+  call void @foo(i8 *%dest, i8 *%src)
+  ret void
+}
+
+; Test the last case that is done using straight-line code.
+define void @f14(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f14:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: mvc 512(256,%r2), 512(%r3)
+; CHECK: mvc 768(256,%r2), 768(%r3)
+; CHECK: mvc 1024(256,%r2), 1024(%r3)
+; CHECK: mvc 1280(256,%r2), 1280(%r3)
+; CHECK: br %r14
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1536, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; Test the first case that is done using a loop.
+define void @f15(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f15:
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 768(%r2)
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: la %r2, 256(%r2)
+; CHECK: la %r3, 256(%r3)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 0(1,%r2), 0(%r3)
+; CHECK: br %r14
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1537, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; ...and again with frame bases, where the base must be loaded into a
+; register before the loop.
+define void @f16() {
+; CHECK-LABEL: f16:
+; CHECK: brasl %r14, foo@PLT
+; CHECK-DAG: lghi [[COUNT:%r[0-5]]], 6
+; CHECK-DAG: la [[BASE:%r[0-5]]], 160(%r15)
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 2368([[BASE]])
+; CHECK: mvc 1600(256,[[BASE]]), 0([[BASE]])
+; CHECK: la [[BASE]], 256([[BASE]])
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 1600(1,[[BASE]]), 0([[BASE]])
+; CHECK: brasl %r14, foo@PLT
  ; CHECK: br %r14
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 257, i32 1,
+  %arr = alloca [3200 x i8]
+  %dest = getelementptr [3200 x i8] *%arr, i64 0, i64 1600
+  %src = getelementptr [3200 x i8] *%arr, i64 0, i64 0
+  call void @foo(i8 *%dest, i8 *%src)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1537, i32 1,
                                         i1 false)
+  call void @foo(i8 *%dest, i8 *%src)
    ret void
  }
diff --git a/test/CodeGen/SystemZ/memset-01.ll b/test/CodeGen/SystemZ/memset-01.ll

index b272a5bcc693eb89bbe323666a542188ff4ef21a..f17901cc73ab43003d2e83da81c783620ad8d81b 100644 (file)
--- a/test/CodeGen/SystemZ/memset-01.ll
+++ b/test/CodeGen/SystemZ/memset-01.ll
@@ -103,22 +103,58 @@ define void @f10(i8 *%dest, i8 %val) {
    ret void
  }
  
-; 258 bytes, i32 version.  258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version.  We need two MVCs.
  define void @f11(i8 *%dest, i8 %val) {
  ; CHECK-LABEL: f11:
-; CHECK-NOT: mvc
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
  ; CHECK: br %r14
    call void @llvm.memset.p0i8.i32(i8 *%dest, i8 %val, i32 258, i32 1, i1 false)
    ret void
  }
  
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
  define void @f12(i8 *%dest, i8 %val) {
  ; CHECK-LABEL: f12:
-; CHECK-NOT: mvc
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
  ; CHECK: br %r14
    call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 258, i32 1, i1 false)
    ret void
  }
+
+; Test the largest case for which straight-line code is used.
+define void @f13(i8 *%dest, i8 %val) {
+; CHECK-LABEL: f13:
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(256,%r2), 256(%r2)
+; CHECK: mvc 513(256,%r2), 512(%r2)
+; CHECK: mvc 769(256,%r2), 768(%r2)
+; CHECK: mvc 1025(256,%r2), 1024(%r2)
+; CHECK: mvc 1281(256,%r2), 1280(%r2)
+; CHECK: br %r14
+  call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 1537, i32 1,
+                                  i1 false)
+  ret void
+}
+
+; Test the next size up, which uses a loop.  We leave the other corner
+; cases to memcpy-01.ll.
+define void @f14(i8 *%dest, i8 %val) {
+; CHECK-LABEL: f14:
+; CHECK: stc %r3, 0(%r2)
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 769(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: la %r2, 256(%r2)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 1538, i32 1,
+                                  i1 false)
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/memset-02.ll b/test/CodeGen/SystemZ/memset-02.ll

index b74d907aa9a09b04159cf1a9d65caf05d0183df7..b4724c0b5745bde5e1635bc303cd3e542d19d659 100644 (file)
--- a/test/CodeGen/SystemZ/memset-02.ll
+++ b/test/CodeGen/SystemZ/memset-02.ll
@@ -139,21 +139,23 @@ define void @f14(i8 *%dest) {
    ret void
  }
  
-; 258 bytes, i32 version.  258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version.  We need two MVCs.
  define void @f15(i8 *%dest) {
  ; CHECK-LABEL: f15:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 128
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
  ; CHECK: br %r14
    call void @llvm.memset.p0i8.i32(i8 *%dest, i8 128, i32 258, i32 1, i1 false)
    ret void
  }
  
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
  define void @f16(i8 *%dest) {
  ; CHECK-LABEL: f16:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 128
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
  ; CHECK: br %r14
    call void @llvm.memset.p0i8.i64(i8 *%dest, i8 128, i64 258, i32 1, i1 false)
    ret void
diff --git a/test/CodeGen/SystemZ/memset-03.ll b/test/CodeGen/SystemZ/memset-03.ll

index 1d48f1ad6dcc191e0a0f905ca8171fc02b2c272e..3f954c4f79fa3a7acda71a3769cdc20f578ecfab 100644 (file)
--- a/test/CodeGen/SystemZ/memset-03.ll
+++ b/test/CodeGen/SystemZ/memset-03.ll
@@ -375,21 +375,23 @@ define void @f38(i8 *%dest) {
    ret void
  }
  
-; 258 bytes, i32 version.  258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version.  We need two MVCs.
  define void @f39(i8 *%dest) {
  ; CHECK-LABEL: f39:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 0
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
  ; CHECK: br %r14
    call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 258, i32 1, i1 false)
    ret void
  }
  
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
  define void @f40(i8 *%dest) {
  ; CHECK-LABEL: f40:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 0
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
  ; CHECK: br %r14
    call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 258, i32 1, i1 false)
    ret void
diff --git a/test/CodeGen/SystemZ/memset-04.ll b/test/CodeGen/SystemZ/memset-04.ll

index 92886921b07bd9fea6b0e3a9e31977a3a4ec3dd4..7906e8d10a1f1c86aaaafef41e7d5c1273359bf0 100644 (file)
--- a/test/CodeGen/SystemZ/memset-04.ll
+++ b/test/CodeGen/SystemZ/memset-04.ll
@@ -375,21 +375,23 @@ define void @f38(i8 *%dest) {
    ret void
  }
  
-; 258 bytes, i32 version.  258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version.  We need two MVCs.
  define void @f39(i8 *%dest) {
  ; CHECK-LABEL: f39:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 255
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
  ; CHECK: br %r14
    call void @llvm.memset.p0i8.i32(i8 *%dest, i8 -1, i32 258, i32 1, i1 false)
    ret void
  }
  
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
  define void @f40(i8 *%dest) {
  ; CHECK-LABEL: f40:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 255
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
  ; CHECK: br %r14
    call void @llvm.memset.p0i8.i64(i8 *%dest, i8 -1, i64 258, i32 1, i1 false)
    ret void
author	Richard Sandiford <rsandifo@linux.vnet.ibm.com>
	Tue, 27 Aug 2013 09:54:29 +0000 (09:54 +0000)
committer	Richard Sandiford <rsandifo@linux.vnet.ibm.com>
	Tue, 27 Aug 2013 09:54:29 +0000 (09:54 +0000)
lib/Target/SystemZ/SystemZISelLowering.cpp		patch \| blob \| history
lib/Target/SystemZ/SystemZISelLowering.h		patch \| blob \| history
lib/Target/SystemZ/SystemZInstrFP.td		patch \| blob \| history
lib/Target/SystemZ/SystemZInstrFormats.td		patch \| blob \| history
lib/Target/SystemZ/SystemZInstrInfo.td		patch \| blob \| history
lib/Target/SystemZ/SystemZOperands.td		patch \| blob \| history
lib/Target/SystemZ/SystemZOperators.td		patch \| blob \| history
lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp		patch \| blob \| history
test/CodeGen/SystemZ/memcpy-01.ll		patch \| blob \| history
test/CodeGen/SystemZ/memset-01.ll		patch \| blob \| history
test/CodeGen/SystemZ/memset-02.ll		patch \| blob \| history
test/CodeGen/SystemZ/memset-03.ll		patch \| blob \| history
test/CodeGen/SystemZ/memset-04.ll		patch \| blob \| history