From: Richard Sandiford Date: Tue, 2 Jul 2013 15:28:56 +0000 (+0000) Subject: [SystemZ] Use MVC to spill loads and stores X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=1ce4894a3f1ce6e63c1b109c24235d81dea2908f [SystemZ] Use MVC to spill loads and stores Try to use MVC when spilling the destination of a simple load or the source of a simple store. As explained in the comment, this doesn't yet handle the case where the load or store location is also a frame index, since that could lead to two simultaneous scavenger spills, something the backend can't handle yet. spill-02.py tests that this restriction kicks in, but unfortunately I've not yet found a case that would fail without it. The volatile trick I used for other scavenger tests doesn't work here because we can't use MVC for volatile accesses anyway. I'm planning on relaxing the restriction later, hopefully with a test that does trigger the problem... Tests @f8 and @f9 also showed that L(G)RL and ST(G)RL were wrongly classified as SimpleBDX{Load,Store}. It wouldn't be easy to test for that bug separately, which is why I didn't split out the fix as a separate patch. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185434 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 0d304323547..af3b7110dbe 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -13,6 +13,7 @@ #include "SystemZInstrInfo.h" #include "SystemZInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetMachine.h" #define GET_INSTRINFO_CTOR @@ -80,7 +81,8 @@ void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const { // Return 0 otherwise. // // Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores. -static int isSimpleMove(const MachineInstr *MI, int &FrameIndex, int Flag) { +static int isSimpleMove(const MachineInstr *MI, int &FrameIndex, + unsigned Flag) { const MCInstrDesc &MCID = MI->getDesc(); if ((MCID.TSFlags & Flag) && MI->getOperand(1).isFI() && @@ -315,6 +317,96 @@ SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, FrameIdx); } +// Return true if MI is a simple load or store with a 12-bit displacement +// and no index. Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores. +static bool isSimpleBD12Move(const MachineInstr *MI, unsigned Flag) { + const MCInstrDesc &MCID = MI->getDesc(); + return ((MCID.TSFlags & Flag) && + isUInt<12>(MI->getOperand(2).getImm()) && + MI->getOperand(3).getReg() == 0); +} + +// Return a MachineMemOperand for FrameIndex with flags MMOFlags. +// Offset is the byte offset from the start of FrameIndex. 
+static MachineMemOperand *getFrameMMO(MachineFunction &MF, int FrameIndex,
+                                      uint64_t &Offset, unsigned MMOFlags) {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Value *V = PseudoSourceValue::getFixedStack(FrameIndex);
+  return MF.getMachineMemOperand(MachinePointerInfo(V, Offset), MMOFlags,
+                                 MFI->getObjectSize(FrameIndex),
+                                 MFI->getObjectAlignment(FrameIndex));
+}
+
+MachineInstr *
+SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                        MachineInstr *MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        int FrameIndex) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned Size = MFI->getObjectSize(FrameIndex);
+
+  // Early exit for cases we don't care about
+  if (Ops.size() != 1)
+    return 0;
+
+  unsigned OpNum = Ops[0];
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned RegSize = MF.getRegInfo().getRegClass(Reg)->getSize();
+  assert(Size == RegSize && "Invalid size combination");
+
+  // Look for cases where the source of a simple store or the destination
+  // of a simple load is being spilled.  Try to use MVC instead.
+  //
+  // Although MVC is in practice a fast choice in these cases, it is still
+  // logically a bytewise copy.  This means that we cannot use it if the
+  // load or store is volatile.  It also means that the transformation is
+  // not valid in cases where the two memories partially overlap; however,
+  // that is not a problem here, because we know that one of the memories
+  // is a full frame index.
+  //
+  // For now we punt if the load or store is also to a frame index.
+  // In that case we might end up eliminating both of them to out-of-range
+  // offsets, which might then force the register scavenger to spill two
+  // other registers.  The backend can only handle one such scavenger spill
+  // at a time.
+  if (OpNum == 0 && MI->hasOneMemOperand()) {
+    MachineMemOperand *MMO = *MI->memoperands_begin();
+    if (MMO->getSize() == Size && !MMO->isVolatile()) {
+      // Handle conversion of loads.
+      if (isSimpleBD12Move(MI, SystemZII::SimpleBDXLoad) &&
+          !MI->getOperand(1).isFI()) {
+        uint64_t Offset = 0;
+        MachineMemOperand *FrameMMO = getFrameMMO(MF, FrameIndex, Offset,
+                                                  MachineMemOperand::MOStore);
+        return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::MVC))
+          .addFrameIndex(FrameIndex).addImm(Offset).addImm(Size)
+          .addOperand(MI->getOperand(1)).addImm(MI->getOperand(2).getImm())
+          .addMemOperand(FrameMMO).addMemOperand(MMO);
+      }
+      // Handle conversion of stores.
+      if (isSimpleBD12Move(MI, SystemZII::SimpleBDXStore) &&
+          !MI->getOperand(1).isFI()) {
+        uint64_t Offset = 0;
+        MachineMemOperand *FrameMMO = getFrameMMO(MF, FrameIndex, Offset,
+                                                  MachineMemOperand::MOLoad);
+        return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::MVC))
+          .addOperand(MI->getOperand(1)).addImm(MI->getOperand(2).getImm())
+          .addImm(Size).addFrameIndex(FrameIndex).addImm(Offset)
+          .addMemOperand(MMO).addMemOperand(FrameMMO);
+      }
+    }
+  }
+
+  return 0;
+}
+
+MachineInstr *
+SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        MachineInstr* LoadMI) const {
+  return 0;
+}
+
 bool
 SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   switch (MI->getOpcode()) {
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index d6980f71713..8d9a3eaacff 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -111,6 +111,14 @@ public:
                                     unsigned DestReg, int FrameIdx,
                                     const TargetRegisterClass *RC,
                                     const TargetRegisterInfo *TRI) const
     LLVM_OVERRIDE;
+  virtual MachineInstr *
+  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                        const SmallVectorImpl<unsigned> &Ops,
+                        int FrameIndex) const;
+  virtual MachineInstr *
+  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI,
+                        const SmallVectorImpl<unsigned> &Ops,
+                        MachineInstr* LoadMI) const;
   virtual bool
   expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const LLVM_OVERRIDE;
   virtual bool
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index 3af41e57546..1b53eb0a428 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -242,11 +242,8 @@ let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
 
 // Register loads.
 let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
-  defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32>;
-  def LRL : UnaryRILPC<"lrl", 0xC4D, aligned_load, GR32>;
-
-  def LG : UnaryRXY<"lg", 0xE304, load, GR64>;
-  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+  defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32>;
+  def LG : UnaryRXY<"lg", 0xE304, load, GR64>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -255,16 +252,16 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
                       [(set GR128:$dst, (load bdxaddr20only128:$src))]>;
   }
 }
+let canFoldAsLoad = 1 in {
+  def LRL : UnaryRILPC<"lrl", 0xC4D, aligned_load, GR32>;
+  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+}
 
 // Register stores.
 let SimpleBDXStore = 1 in {
-  let isCodeGenOnly = 1 in {
-    defm ST32 : StoreRXPair<"st", 0x50, 0xE350, store, GR32>;
-    def STRL32 : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
-  }
-
-  def STG : StoreRXY<"stg", 0xE324, store, GR64>;
-  def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
+  let isCodeGenOnly = 1 in
+    defm ST32 : StoreRXPair<"st", 0x50, 0xE350, store, GR32>;
+  def STG : StoreRXY<"stg", 0xE324, store, GR64>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -273,6 +270,9 @@ let SimpleBDXStore = 1 in {
                       [(store GR128:$src, bdxaddr20only128:$dst)]>;
   }
 }
+let isCodeGenOnly = 1 in
+  def STRL32 : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
+def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
 
 // 8-bit immediate stores to 8-bit fields.
defm MVI : StoreSIPair<"mvi", 0x92, 0xEB52, truncstorei8, imm32zx8trunc>; diff --git a/test/CodeGen/SystemZ/Large/spill-01.py b/test/CodeGen/SystemZ/Large/spill-01.py new file mode 100644 index 00000000000..3c1d0b611bb --- /dev/null +++ b/test/CodeGen/SystemZ/Large/spill-01.py @@ -0,0 +1,40 @@ +# Test cases where MVC is used for spill slots that end up being out of range. +# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s + +# There are 8 usable call-saved GPRs, two of which are needed for the base +# registers. The first 160 bytes of the frame are needed for the ABI +# call frame, and a further 8 bytes are needed for the emergency spill slot. +# That means we will have at least one out-of-range slot if: +# +# count == (4096 - 168) / 8 + 6 + 1 == 498 +# +# Add in some extra room and check both %r15+4096 (the first out-of-range slot) +# and %r15+4104. +# +# CHECK: f1: +# CHECK: lay [[REG:%r[0-5]]], 4096(%r15) +# CHECK: mvc 0(8,[[REG]]), {{[0-9]+}}({{%r[0-9]+}}) +# CHECK: brasl %r14, foo@PLT +# CHECK: lay [[REG:%r[0-5]]], 4096(%r15) +# CHECK: mvc {{[0-9]+}}(8,{{%r[0-9]+}}), 8([[REG]]) +# CHECK: br %r14 +count = 500 + +print 'declare void @foo()' +print '' +print 'define void @f1(i64 *%base0, i64 *%base1) {' + +for i in range(count): + print ' %%ptr%d = getelementptr i64 *%%base%d, i64 %d' % (i, i % 2, i / 2) + print ' %%val%d = load i64 *%%ptr%d' % (i, i) + print '' + +print ' call void @foo()' +print '' + +for i in range(count): + print ' store i64 %%val%d, i64 *%%ptr%d' % (i, i) + +print '' +print ' ret void' +print '}' diff --git a/test/CodeGen/SystemZ/Large/spill-02.py b/test/CodeGen/SystemZ/Large/spill-02.py new file mode 100644 index 00000000000..0eba3edecb7 --- /dev/null +++ b/test/CodeGen/SystemZ/Large/spill-02.py @@ -0,0 +1,46 @@ +# Test cases where we spill from one frame index to another, both of which +# would be out of range of MVC. At present we don't use MVC in this case. +# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s + +# There are 8 usable call-saved GPRs. The first 160 bytes of the frame +# are needed for the ABI call frame, and a further 8 bytes are needed +# for the emergency spill slot. That means we will have at least one +# out-of-range slot if: +# +# count == (4096 - 168) / 8 + 8 + 1 == 500 +# +# Add in some extra just to be sure. +# +# CHECK: f1: +# CHECK-NOT: mvc +# CHECK: br %r14 +count = 510 + +print 'declare void @foo(i64 *%base0, i64 *%base1)' +print '' +print 'define void @f1() {' + +for i in range(2): + print ' %%alloc%d = alloca [%d x i64]' % (i, count / 2) + print (' %%base%d = getelementptr [%d x i64] * %%alloc%d, i64 0, i64 0' + % (i, count / 2, i)) + +print ' call void @foo(i64 *%base0, i64 *%base1)' +print '' + +for i in range(count): + print ' %%ptr%d = getelementptr i64 *%%base%d, i64 %d' % (i, i % 2, i / 2) + print ' %%val%d = load i64 *%%ptr%d' % (i, i) + print '' + +print ' call void @foo(i64 *%base0, i64 *%base1)' +print '' + +for i in range (count): + print ' store i64 %%val%d, i64 *%%ptr%d' % (i, i) + +print '' +print ' call void @foo(i64 *%base0, i64 *%base1)' +print '' +print ' ret void' +print '}' diff --git a/test/CodeGen/SystemZ/spill-01.ll b/test/CodeGen/SystemZ/spill-01.ll new file mode 100644 index 00000000000..a5e48eefcc8 --- /dev/null +++ b/test/CodeGen/SystemZ/spill-01.ll @@ -0,0 +1,383 @@ +; Test spilling using MVC. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +declare void @foo() + +@g0 = global i32 0 +@g1 = global i32 1 +@g2 = global i32 2 +@g3 = global i32 3 +@g4 = global i32 4 +@g5 = global i32 5 +@g6 = global i32 6 +@g7 = global i32 7 +@g8 = global i32 8 +@g9 = global i32 9 + +@h0 = global i64 0 +@h1 = global i64 1 +@h2 = global i64 2 +@h3 = global i64 3 +@h4 = global i64 4 +@h5 = global i64 5 +@h6 = global i64 6 +@h7 = global i64 7 +@h8 = global i64 8 +@h9 = global i64 9 + +; This function shouldn't spill anything +define void @f1(i32 *%ptr0) { +; CHECK: f1: +; CHECK: stmg +; CHECK: aghi %r15, -160 +; CHECK-NOT: %r15 +; CHECK: brasl %r14, foo@PLT +; CHECK-NOT: %r15 +; CHECK: lmg +; CHECK: br %r14 + %ptr1 = getelementptr i32 *%ptr0, i32 2 + %ptr2 = getelementptr i32 *%ptr0, i32 4 + %ptr3 = getelementptr i32 *%ptr0, i32 6 + %ptr4 = getelementptr i32 *%ptr0, i32 8 + %ptr5 = getelementptr i32 *%ptr0, i32 10 + %ptr6 = getelementptr i32 *%ptr0, i32 12 + + %val0 = load i32 *%ptr0 + %val1 = load i32 *%ptr1 + %val2 = load i32 *%ptr2 + %val3 = load i32 *%ptr3 + %val4 = load i32 *%ptr4 + %val5 = load i32 *%ptr5 + %val6 = load i32 *%ptr6 + + call void @foo() + + store i32 %val0, i32 *%ptr0 + store i32 %val1, i32 *%ptr1 + store i32 %val2, i32 *%ptr2 + store i32 %val3, i32 *%ptr3 + store i32 %val4, i32 *%ptr4 + store i32 %val5, i32 *%ptr5 + store i32 %val6, i32 *%ptr6 + + ret void +} + +; Test a case where at least one i32 load and at least one i32 store +; need spills. +define void @f2(i32 *%ptr0) { +; CHECK: f2: +; CHECK: mvc [[OFFSET1:16[04]]](4,%r15), [[OFFSET2:[0-9]+]]({{%r[0-9]+}}) +; CHECK: brasl %r14, foo@PLT +; CHECK: mvc [[OFFSET2]](4,{{%r[0-9]+}}), [[OFFSET1]](%r15) +; CHECK: br %r14 + %ptr1 = getelementptr i32 *%ptr0, i64 2 + %ptr2 = getelementptr i32 *%ptr0, i64 4 + %ptr3 = getelementptr i32 *%ptr0, i64 6 + %ptr4 = getelementptr i32 *%ptr0, i64 8 + %ptr5 = getelementptr i32 *%ptr0, i64 10 + %ptr6 = getelementptr i32 *%ptr0, i64 12 + %ptr7 = getelementptr i32 *%ptr0, i64 14 + %ptr8 = getelementptr i32 *%ptr0, i64 16 + + %val0 = load i32 *%ptr0 + %val1 = load i32 *%ptr1 + %val2 = load i32 *%ptr2 + %val3 = load i32 *%ptr3 + %val4 = load i32 *%ptr4 + %val5 = load i32 *%ptr5 + %val6 = load i32 *%ptr6 + %val7 = load i32 *%ptr7 + %val8 = load i32 *%ptr8 + + call void @foo() + + store i32 %val0, i32 *%ptr0 + store i32 %val1, i32 *%ptr1 + store i32 %val2, i32 *%ptr2 + store i32 %val3, i32 *%ptr3 + store i32 %val4, i32 *%ptr4 + store i32 %val5, i32 *%ptr5 + store i32 %val6, i32 *%ptr6 + store i32 %val7, i32 *%ptr7 + store i32 %val8, i32 *%ptr8 + + ret void +} + +; Test a case where at least one i64 load and at least one i64 store +; need spills. 
+define void @f3(i64 *%ptr0) {
+; CHECK: f3:
+; CHECK: mvc 160(8,%r15), [[OFFSET:[0-9]+]]({{%r[0-9]+}})
+; CHECK: brasl %r14, foo@PLT
+; CHECK: mvc [[OFFSET]](8,{{%r[0-9]+}}), 160(%r15)
+; CHECK: br %r14
+  %ptr1 = getelementptr i64 *%ptr0, i64 2
+  %ptr2 = getelementptr i64 *%ptr0, i64 4
+  %ptr3 = getelementptr i64 *%ptr0, i64 6
+  %ptr4 = getelementptr i64 *%ptr0, i64 8
+  %ptr5 = getelementptr i64 *%ptr0, i64 10
+  %ptr6 = getelementptr i64 *%ptr0, i64 12
+  %ptr7 = getelementptr i64 *%ptr0, i64 14
+  %ptr8 = getelementptr i64 *%ptr0, i64 16
+
+  %val0 = load i64 *%ptr0
+  %val1 = load i64 *%ptr1
+  %val2 = load i64 *%ptr2
+  %val3 = load i64 *%ptr3
+  %val4 = load i64 *%ptr4
+  %val5 = load i64 *%ptr5
+  %val6 = load i64 *%ptr6
+  %val7 = load i64 *%ptr7
+  %val8 = load i64 *%ptr8
+
+  call void @foo()
+
+  store i64 %val0, i64 *%ptr0
+  store i64 %val1, i64 *%ptr1
+  store i64 %val2, i64 *%ptr2
+  store i64 %val3, i64 *%ptr3
+  store i64 %val4, i64 *%ptr4
+  store i64 %val5, i64 *%ptr5
+  store i64 %val6, i64 *%ptr6
+  store i64 %val7, i64 *%ptr7
+  store i64 %val8, i64 *%ptr8
+
+  ret void
+}
+
+
+; Test a case where at least one f32 load and at least one f32 store
+; need spills.  The 8 call-saved FPRs could be used for 8 of the %vals
+; (and are at the time of writing), but it would really be better to use
+; MVC for all 10.
+define void @f4(float *%ptr0) {
+; CHECK: f4:
+; CHECK: mvc [[OFFSET1:16[04]]](4,%r15), [[OFFSET2:[0-9]+]]({{%r[0-9]+}})
+; CHECK: brasl %r14, foo@PLT
+; CHECK: mvc [[OFFSET2]](4,{{%r[0-9]+}}), [[OFFSET1]](%r15)
+; CHECK: br %r14
+  %ptr1 = getelementptr float *%ptr0, i64 2
+  %ptr2 = getelementptr float *%ptr0, i64 4
+  %ptr3 = getelementptr float *%ptr0, i64 6
+  %ptr4 = getelementptr float *%ptr0, i64 8
+  %ptr5 = getelementptr float *%ptr0, i64 10
+  %ptr6 = getelementptr float *%ptr0, i64 12
+  %ptr7 = getelementptr float *%ptr0, i64 14
+  %ptr8 = getelementptr float *%ptr0, i64 16
+  %ptr9 = getelementptr float *%ptr0, i64 18
+
+  %val0 = load float *%ptr0
+  %val1 = load float *%ptr1
+  %val2 = load float *%ptr2
+  %val3 = load float *%ptr3
+  %val4 = load float *%ptr4
+  %val5 = load float *%ptr5
+  %val6 = load float *%ptr6
+  %val7 = load float *%ptr7
+  %val8 = load float *%ptr8
+  %val9 = load float *%ptr9
+
+  call void @foo()
+
+  store float %val0, float *%ptr0
+  store float %val1, float *%ptr1
+  store float %val2, float *%ptr2
+  store float %val3, float *%ptr3
+  store float %val4, float *%ptr4
+  store float %val5, float *%ptr5
+  store float %val6, float *%ptr6
+  store float %val7, float *%ptr7
+  store float %val8, float *%ptr8
+  store float %val9, float *%ptr9
+
+  ret void
+}
+
+; Similarly for f64.
+define void @f5(double *%ptr0) { +; CHECK: f5: +; CHECK: mvc 160(8,%r15), [[OFFSET:[0-9]+]]({{%r[0-9]+}}) +; CHECK: brasl %r14, foo@PLT +; CHECK: mvc [[OFFSET]](8,{{%r[0-9]+}}), 160(%r15) +; CHECK: br %r14 + %ptr1 = getelementptr double *%ptr0, i64 2 + %ptr2 = getelementptr double *%ptr0, i64 4 + %ptr3 = getelementptr double *%ptr0, i64 6 + %ptr4 = getelementptr double *%ptr0, i64 8 + %ptr5 = getelementptr double *%ptr0, i64 10 + %ptr6 = getelementptr double *%ptr0, i64 12 + %ptr7 = getelementptr double *%ptr0, i64 14 + %ptr8 = getelementptr double *%ptr0, i64 16 + %ptr9 = getelementptr double *%ptr0, i64 18 + + %val0 = load double *%ptr0 + %val1 = load double *%ptr1 + %val2 = load double *%ptr2 + %val3 = load double *%ptr3 + %val4 = load double *%ptr4 + %val5 = load double *%ptr5 + %val6 = load double *%ptr6 + %val7 = load double *%ptr7 + %val8 = load double *%ptr8 + %val9 = load double *%ptr9 + + call void @foo() + + store double %val0, double *%ptr0 + store double %val1, double *%ptr1 + store double %val2, double *%ptr2 + store double %val3, double *%ptr3 + store double %val4, double *%ptr4 + store double %val5, double *%ptr5 + store double %val6, double *%ptr6 + store double %val7, double *%ptr7 + store double %val8, double *%ptr8 + store double %val9, double *%ptr9 + + ret void +} + +; Repeat f2 with atomic accesses. We shouldn't use MVC here. +define void @f6(i32 *%ptr0) { +; CHECK: f6: +; CHECK-NOT: mvc +; CHECK: br %r14 + %ptr1 = getelementptr i32 *%ptr0, i64 2 + %ptr2 = getelementptr i32 *%ptr0, i64 4 + %ptr3 = getelementptr i32 *%ptr0, i64 6 + %ptr4 = getelementptr i32 *%ptr0, i64 8 + %ptr5 = getelementptr i32 *%ptr0, i64 10 + %ptr6 = getelementptr i32 *%ptr0, i64 12 + %ptr7 = getelementptr i32 *%ptr0, i64 14 + %ptr8 = getelementptr i32 *%ptr0, i64 16 + + %val0 = load atomic i32 *%ptr0 unordered, align 4 + %val1 = load atomic i32 *%ptr1 unordered, align 4 + %val2 = load atomic i32 *%ptr2 unordered, align 4 + %val3 = load atomic i32 *%ptr3 unordered, align 4 + %val4 = load atomic i32 *%ptr4 unordered, align 4 + %val5 = load atomic i32 *%ptr5 unordered, align 4 + %val6 = load atomic i32 *%ptr6 unordered, align 4 + %val7 = load atomic i32 *%ptr7 unordered, align 4 + %val8 = load atomic i32 *%ptr8 unordered, align 4 + + call void @foo() + + store atomic i32 %val0, i32 *%ptr0 unordered, align 4 + store atomic i32 %val1, i32 *%ptr1 unordered, align 4 + store atomic i32 %val2, i32 *%ptr2 unordered, align 4 + store atomic i32 %val3, i32 *%ptr3 unordered, align 4 + store atomic i32 %val4, i32 *%ptr4 unordered, align 4 + store atomic i32 %val5, i32 *%ptr5 unordered, align 4 + store atomic i32 %val6, i32 *%ptr6 unordered, align 4 + store atomic i32 %val7, i32 *%ptr7 unordered, align 4 + store atomic i32 %val8, i32 *%ptr8 unordered, align 4 + + ret void +} + +; ...likewise volatile accesses. 
+define void @f7(i32 *%ptr0) { +; CHECK: f7: +; CHECK-NOT: mvc +; CHECK: br %r14 + %ptr1 = getelementptr i32 *%ptr0, i64 2 + %ptr2 = getelementptr i32 *%ptr0, i64 4 + %ptr3 = getelementptr i32 *%ptr0, i64 6 + %ptr4 = getelementptr i32 *%ptr0, i64 8 + %ptr5 = getelementptr i32 *%ptr0, i64 10 + %ptr6 = getelementptr i32 *%ptr0, i64 12 + %ptr7 = getelementptr i32 *%ptr0, i64 14 + %ptr8 = getelementptr i32 *%ptr0, i64 16 + + %val0 = load volatile i32 *%ptr0 + %val1 = load volatile i32 *%ptr1 + %val2 = load volatile i32 *%ptr2 + %val3 = load volatile i32 *%ptr3 + %val4 = load volatile i32 *%ptr4 + %val5 = load volatile i32 *%ptr5 + %val6 = load volatile i32 *%ptr6 + %val7 = load volatile i32 *%ptr7 + %val8 = load volatile i32 *%ptr8 + + call void @foo() + + store volatile i32 %val0, i32 *%ptr0 + store volatile i32 %val1, i32 *%ptr1 + store volatile i32 %val2, i32 *%ptr2 + store volatile i32 %val3, i32 *%ptr3 + store volatile i32 %val4, i32 *%ptr4 + store volatile i32 %val5, i32 *%ptr5 + store volatile i32 %val6, i32 *%ptr6 + store volatile i32 %val7, i32 *%ptr7 + store volatile i32 %val8, i32 *%ptr8 + + ret void +} + +; Check that LRL and STRL are not converted. +define void @f8() { +; CHECK: f8: +; CHECK-NOT: mvc +; CHECK: br %r14 + %val0 = load i32 *@g0 + %val1 = load i32 *@g1 + %val2 = load i32 *@g2 + %val3 = load i32 *@g3 + %val4 = load i32 *@g4 + %val5 = load i32 *@g5 + %val6 = load i32 *@g6 + %val7 = load i32 *@g7 + %val8 = load i32 *@g8 + %val9 = load i32 *@g9 + + call void @foo() + + store i32 %val0, i32 *@g0 + store i32 %val1, i32 *@g1 + store i32 %val2, i32 *@g2 + store i32 %val3, i32 *@g3 + store i32 %val4, i32 *@g4 + store i32 %val5, i32 *@g5 + store i32 %val6, i32 *@g6 + store i32 %val7, i32 *@g7 + store i32 %val8, i32 *@g8 + store i32 %val9, i32 *@g9 + + ret void +} + +; Likewise LGRL and STGRL. +define void @f9() { +; CHECK: f9: +; CHECK-NOT: mvc +; CHECK: br %r14 + %val0 = load i64 *@h0 + %val1 = load i64 *@h1 + %val2 = load i64 *@h2 + %val3 = load i64 *@h3 + %val4 = load i64 *@h4 + %val5 = load i64 *@h5 + %val6 = load i64 *@h6 + %val7 = load i64 *@h7 + %val8 = load i64 *@h8 + %val9 = load i64 *@h9 + + call void @foo() + + store i64 %val0, i64 *@h0 + store i64 %val1, i64 *@h1 + store i64 %val2, i64 *@h2 + store i64 %val3, i64 *@h3 + store i64 %val4, i64 *@h4 + store i64 %val5, i64 *@h5 + store i64 %val6, i64 *@h6 + store i64 %val7, i64 *@h7 + store i64 %val8, i64 *@h8 + store i64 %val9, i64 *@h9 + + ret void +}
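
The fold implemented in SystemZInstrInfo.cpp above comes down to a handful of
checks: the load or store being folded must be a simple base + 12-bit unsigned
displacement access with no index register (isSimpleBD12Move), its size must
match the spill slot, it must not be volatile, and for now the other address
must not be another frame index.  The following self-contained C++ sketch
restates that predicate and the slot-count arithmetic from spill-01.py; the
names SimpleAccess and canUseMVCForSpill are invented for illustration and are
not part of the LLVM API.

// Standalone sketch only: SimpleAccess and canUseMVCForSpill are invented
// names, not LLVM SystemZ backend code.
#include <cstdint>
#include <iostream>

// A simplified view of the load or store whose register is being spilled.
struct SimpleAccess {
  uint64_t Displacement;  // byte offset from the base register
  unsigned IndexReg;      // 0 means "no index register"
  uint64_t Size;          // access size in bytes
  bool IsVolatile;        // MVC is logically a bytewise copy, so no volatile
  bool OtherIsFrameIndex; // true if the other address is itself a frame slot
};

// Mirrors the conditions checked before a spill or reload is rewritten as
// MVC: base + 12-bit unsigned displacement, no index register, matching
// sizes, no volatile access, and (for now) no frame-index-to-frame-index
// copies, which could force two simultaneous register-scavenger spills.
bool canUseMVCForSpill(const SimpleAccess &MemOp, uint64_t SpillSlotSize) {
  return MemOp.Displacement < 4096 &&      // isUInt<12>(Displacement)
         MemOp.IndexReg == 0 &&
         MemOp.Size == SpillSlotSize &&
         !MemOp.IsVolatile &&
         !MemOp.OtherIsFrameIndex;
}

int main() {
  // Threshold used by spill-01.py: a 160-byte ABI call frame plus an 8-byte
  // emergency spill slot, with 8 usable call-saved GPRs of which 2 serve as
  // base registers.  Once this many i64 values are live across the call, at
  // least one 8-byte spill slot lands at or beyond offset 4096, outside the
  // 12-bit displacement range of MVC.
  unsigned Count = (4096 - (160 + 8)) / 8 + (8 - 2) + 1;
  std::cout << "first out-of-range count: " << Count << '\n';  // prints 498

  // A plain 8-byte access folds; a volatile one (or one whose displacement
  // needs more than 12 bits) does not.
  SimpleAccess Plain = {4088, 0, 8, false, false};
  SimpleAccess Volatile = {4088, 0, 8, true, false};
  std::cout << std::boolalpha << canUseMVCForSpill(Plain, 8) << ' '
            << canUseMVCForSpill(Volatile, 8) << '\n';  // prints true false
  return 0;
}

spill-01.py relies on the same arithmetic: once the first spill slot sits at
offset 4096, the frame address no longer fits in MVC's displacement field, so
the test expects an LAY that materializes the slot address in another register,
followed by an MVC relative to that register.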