From 1c2f863df926830940663ab78550b6837d1a894a Mon Sep 17 00:00:00 2001 From: Hao Liu Date: Thu, 8 May 2014 07:38:13 +0000 Subject: [PATCH] AArch64/ARM64: Port NEON post-increment load/store with 2/3/4 vectors to ARM64 backend. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208284 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM64/ARM64ISelDAGToDAG.cpp | 584 ++- lib/Target/ARM64/ARM64ISelLowering.cpp | 190 + lib/Target/ARM64/ARM64ISelLowering.h | 25 +- test/CodeGen/ARM64/indexed-vector-ldst.ll | 5077 +++++++++++++++++++++ 4 files changed, 5820 insertions(+), 56 deletions(-) diff --git a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp index d9c945ccd92..f216f79255e 100644 --- a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp +++ b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp @@ -150,10 +150,15 @@ public: SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, unsigned SubRegIdx); + SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx); SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node); SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node); @@ -952,33 +957,43 @@ SDNode *ARM64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); SDValue SuperReg = SDValue(Ld, 0); - - // MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - // MemOp[0] = cast(N)->getMemOperand(); - // cast(Ld)->setMemRefs(MemOp, MemOp + 1); - - switch (NumVecs) { - case 4: - ReplaceUses(SDValue(N, 3), 
CurDAG->getTargetExtractSubreg(SubRegIdx + 3, dl, - VT, SuperReg)); - // FALLTHROUGH - case 3: - ReplaceUses(SDValue(N, 2), CurDAG->getTargetExtractSubreg(SubRegIdx + 2, dl, - VT, SuperReg)); - // FALLTHROUGH - case 2: - ReplaceUses(SDValue(N, 1), CurDAG->getTargetExtractSubreg(SubRegIdx + 1, dl, - VT, SuperReg)); - ReplaceUses(SDValue(N, 0), - CurDAG->getTargetExtractSubreg(SubRegIdx, dl, VT, SuperReg)); - break; - case 1: - ReplaceUses(SDValue(N, 0), SuperReg); - break; - } + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + return nullptr; +} + +SDNode *ARM64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + SmallVector Ops; + Ops.push_back(N->getOperand(1)); // Mem operand + Ops.push_back(N->getOperand(2)); // Incremental + Ops.push_back(Chain); + + std::vector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of vector list + SDValue SuperReg = SDValue(Ld, 1); + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + // Update the chain + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); return nullptr; } @@ -1001,6 +1016,29 @@ SDNode *ARM64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, return St; } +SDNode *ARM64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + SmallVector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + 
ResTys.push_back(MVT::Other); // Type for the Chain + + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + 1)); // base register + Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental + Ops.push_back(N->getOperand(0)); // Chain + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + return St; +} + /// WidenVector - Given a value in the V64 register class, produce the /// equivalent value in the V128 register class. class WidenVector { @@ -1065,42 +1103,68 @@ SDNode *ARM64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, SDValue SuperReg = SDValue(Ld, 0); EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - switch (NumVecs) { - case 4: { - SDValue NV3 = - CurDAG->getTargetExtractSubreg(ARM64::qsub3, dl, WideVT, SuperReg); + static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2, + ARM64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); if (Narrow) - ReplaceUses(SDValue(N, 3), NarrowVector(NV3, *CurDAG)); - else - ReplaceUses(SDValue(N, 3), NV3); + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); } - // FALLTHROUGH - case 3: { - SDValue NV2 = - CurDAG->getTargetExtractSubreg(ARM64::qsub2, dl, WideVT, SuperReg); + + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + + return Ld; +} + +SDNode *ARM64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. 
+ SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + std::vector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 1))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number + Ops.push_back(N->getOperand(NumVecs + 2)); // Base register + Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental + Ops.push_back(N->getOperand(0)); + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of the write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of the vector list + SDValue SuperReg = SDValue(Ld, 1); + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2, + ARM64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); if (Narrow) - ReplaceUses(SDValue(N, 2), NarrowVector(NV2, *CurDAG)); - else - ReplaceUses(SDValue(N, 2), NV2); - } - // FALLTHROUGH - case 2: { - SDValue NV1 = - CurDAG->getTargetExtractSubreg(ARM64::qsub1, dl, WideVT, SuperReg); - SDValue NV0 = - CurDAG->getTargetExtractSubreg(ARM64::qsub0, dl, WideVT, SuperReg); - if (Narrow) { - ReplaceUses(SDValue(N, 1), NarrowVector(NV1, *CurDAG)); - ReplaceUses(SDValue(N, 0), NarrowVector(NV0, *CurDAG)); - } else { - ReplaceUses(SDValue(N, 1), NV1); - ReplaceUses(SDValue(N, 0), NV0); - } - break; - } + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); } - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + // Update the Chain + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); return Ld; } @@ -1138,6 +1202,44 @@ 
SDNode *ARM64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, return St; } +SDNode *ARM64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + SmallVector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 1))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register + Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. 
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; +} + static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, SDValue &Opd0, unsigned &LSB, unsigned &MSB, @@ -2441,6 +2543,378 @@ SDNode *ARM64DAGToDAGISel::Select(SDNode *Node) { } } } + case ARM64ISD::LD2post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, ARM64::LD2Twov8b_POST, ARM64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, ARM64::LD2Twov16b_POST, ARM64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, ARM64::LD2Twov4h_POST, ARM64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 2, ARM64::LD2Twov8h_POST, ARM64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, ARM64::LD2Twov2s_POST, ARM64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, ARM64::LD2Twov4s_POST, ARM64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, ARM64::LD1Twov1d_POST, ARM64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, ARM64::LD2Twov2d_POST, ARM64::qsub0); + break; + } + case ARM64ISD::LD3post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, ARM64::LD3Threev8b_POST, ARM64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, ARM64::LD3Threev16b_POST, ARM64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, ARM64::LD3Threev4h_POST, ARM64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, ARM64::LD3Threev8h_POST, ARM64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, ARM64::LD3Threev2s_POST, ARM64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, ARM64::LD3Threev4s_POST, ARM64::qsub0); + else if (VT == MVT::v1i64 
|| VT == MVT::v1f64) + return SelectPostLoad(Node, 3, ARM64::LD1Threev1d_POST, ARM64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, ARM64::LD3Threev2d_POST, ARM64::qsub0); + break; + } + case ARM64ISD::LD4post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, ARM64::LD4Fourv8b_POST, ARM64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, ARM64::LD4Fourv16b_POST, ARM64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, ARM64::LD4Fourv4h_POST, ARM64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, ARM64::LD4Fourv8h_POST, ARM64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, ARM64::LD4Fourv2s_POST, ARM64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, ARM64::LD4Fourv4s_POST, ARM64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, ARM64::LD1Fourv1d_POST, ARM64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, ARM64::LD4Fourv2d_POST, ARM64::qsub0); + break; + } + case ARM64ISD::LD1x2post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, ARM64::LD1Twov8b_POST, ARM64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, ARM64::LD1Twov16b_POST, ARM64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, ARM64::LD1Twov4h_POST, ARM64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 2, ARM64::LD1Twov8h_POST, ARM64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, ARM64::LD1Twov2s_POST, ARM64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, ARM64::LD1Twov4s_POST, ARM64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, ARM64::LD1Twov1d_POST, ARM64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return 
SelectPostLoad(Node, 2, ARM64::LD1Twov2d_POST, ARM64::qsub0); + break; + } + case ARM64ISD::LD1x3post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, ARM64::LD1Threev8b_POST, ARM64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, ARM64::LD1Threev16b_POST, ARM64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, ARM64::LD1Threev4h_POST, ARM64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, ARM64::LD1Threev8h_POST, ARM64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, ARM64::LD1Threev2s_POST, ARM64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, ARM64::LD1Threev4s_POST, ARM64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, ARM64::LD1Threev1d_POST, ARM64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, ARM64::LD1Threev2d_POST, ARM64::qsub0); + break; + } + case ARM64ISD::LD1x4post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, ARM64::LD1Fourv8b_POST, ARM64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, ARM64::LD1Fourv16b_POST, ARM64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, ARM64::LD1Fourv4h_POST, ARM64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, ARM64::LD1Fourv8h_POST, ARM64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, ARM64::LD1Fourv2s_POST, ARM64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, ARM64::LD1Fourv4s_POST, ARM64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, ARM64::LD1Fourv1d_POST, ARM64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, ARM64::LD1Fourv2d_POST, ARM64::qsub0); + break; + } + case ARM64ISD::LD2DUPpost: { + if (VT == MVT::v8i8) + return 
SelectPostLoad(Node, 2, ARM64::LD2Rv8b_POST, ARM64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, ARM64::LD2Rv16b_POST, ARM64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, ARM64::LD2Rv4h_POST, ARM64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 2, ARM64::LD2Rv8h_POST, ARM64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, ARM64::LD2Rv2s_POST, ARM64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, ARM64::LD2Rv4s_POST, ARM64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, ARM64::LD2Rv1d_POST, ARM64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, ARM64::LD2Rv2d_POST, ARM64::qsub0); + break; + } + case ARM64ISD::LD3DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, ARM64::LD3Rv8b_POST, ARM64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, ARM64::LD3Rv16b_POST, ARM64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, ARM64::LD3Rv4h_POST, ARM64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, ARM64::LD3Rv8h_POST, ARM64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, ARM64::LD3Rv2s_POST, ARM64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, ARM64::LD3Rv4s_POST, ARM64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, ARM64::LD3Rv1d_POST, ARM64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, ARM64::LD3Rv2d_POST, ARM64::qsub0); + break; + } + case ARM64ISD::LD4DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, ARM64::LD4Rv8b_POST, ARM64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, ARM64::LD4Rv16b_POST, ARM64::qsub0); + else if (VT == MVT::v4i16) + 
return SelectPostLoad(Node, 4, ARM64::LD4Rv4h_POST, ARM64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, ARM64::LD4Rv8h_POST, ARM64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, ARM64::LD4Rv2s_POST, ARM64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, ARM64::LD4Rv4s_POST, ARM64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, ARM64::LD4Rv1d_POST, ARM64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, ARM64::LD4Rv2d_POST, ARM64::qsub0); + break; + } + case ARM64ISD::LD2LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 2, ARM64::LD2i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 2, ARM64::LD2i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 2, ARM64::LD2i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 2, ARM64::LD2i64_POST); + break; + } + case ARM64ISD::LD3LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 3, ARM64::LD3i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 3, ARM64::LD3i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 3, ARM64::LD3i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 3, ARM64::LD3i64_POST); + break; + } + case ARM64ISD::LD4LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 4, ARM64::LD4i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 4, ARM64::LD4i16_POST); + else if 
(VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 4, ARM64::LD4i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 4, ARM64::LD4i64_POST); + break; + } + case ARM64ISD::ST2post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 2, ARM64::ST2Twov8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 2, ARM64::ST2Twov16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 2, ARM64::ST2Twov4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 2, ARM64::ST2Twov8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 2, ARM64::ST2Twov2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 2, ARM64::ST2Twov4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 2, ARM64::ST2Twov2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 2, ARM64::ST1Twov1d_POST); + break; + } + case ARM64ISD::ST3post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 3, ARM64::ST3Threev8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 3, ARM64::ST3Threev16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 3, ARM64::ST3Threev4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 3, ARM64::ST3Threev8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 3, ARM64::ST3Threev2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 3, ARM64::ST3Threev4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 3, ARM64::ST3Threev2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 3, 
ARM64::ST1Threev1d_POST); + break; + } + case ARM64ISD::ST4post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 4, ARM64::ST4Fourv8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 4, ARM64::ST4Fourv16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 4, ARM64::ST4Fourv4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 4, ARM64::ST4Fourv8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 4, ARM64::ST4Fourv2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 4, ARM64::ST4Fourv4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 4, ARM64::ST4Fourv2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 4, ARM64::ST1Fourv1d_POST); + break; + } + case ARM64ISD::ST1x2post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 2, ARM64::ST1Twov8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 2, ARM64::ST1Twov16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 2, ARM64::ST1Twov4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 2, ARM64::ST1Twov8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 2, ARM64::ST1Twov2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 2, ARM64::ST1Twov4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 2, ARM64::ST1Twov1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 2, ARM64::ST1Twov2d_POST); + break; + } + case ARM64ISD::ST1x3post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 3, ARM64::ST1Threev8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 3, 
ARM64::ST1Threev16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 3, ARM64::ST1Threev4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 3, ARM64::ST1Threev8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 3, ARM64::ST1Threev2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 3, ARM64::ST1Threev4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 3, ARM64::ST1Threev1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 3, ARM64::ST1Threev2d_POST); + break; + } + case ARM64ISD::ST1x4post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 4, ARM64::ST1Fourv8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 4, ARM64::ST1Fourv16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 4, ARM64::ST1Fourv4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 4, ARM64::ST1Fourv8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 4, ARM64::ST1Fourv2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 4, ARM64::ST1Fourv4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 4, ARM64::ST1Fourv1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 4, ARM64::ST1Fourv2d_POST); + break; + } + case ARM64ISD::ST2LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 2, ARM64::ST2i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 2, ARM64::ST2i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 2, ARM64::ST2i32_POST); + else if (VT == MVT::v2i64 || VT == 
MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 2, ARM64::ST2i64_POST); + break; + } + case ARM64ISD::ST3LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 3, ARM64::ST3i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 3, ARM64::ST3i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 3, ARM64::ST3i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 3, ARM64::ST3i64_POST); + break; + } + case ARM64ISD::ST4LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 4, ARM64::ST4i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 4, ARM64::ST4i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 4, ARM64::ST4i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 4, ARM64::ST4i64_POST); + break; + } case ISD::FCEIL: case ISD::FFLOOR: diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp index 4c6d7648d57..bff6ba060fb 100644 --- a/lib/Target/ARM64/ARM64ISelLowering.cpp +++ b/lib/Target/ARM64/ARM64ISelLowering.cpp @@ -369,6 +369,9 @@ ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM) setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; @@ -729,6 +732,27 
@@ const char *ARM64TargetLowering::getTargetNodeName(unsigned Opcode) const { case ARM64ISD::URSHR_I: return "ARM64ISD::URSHR_I"; case ARM64ISD::SQSHLU_I: return "ARM64ISD::SQSHLU_I"; case ARM64ISD::WrapperLarge: return "ARM64ISD::WrapperLarge"; + case ARM64ISD::LD2post: return "ARM64ISD::LD2post"; + case ARM64ISD::LD3post: return "ARM64ISD::LD3post"; + case ARM64ISD::LD4post: return "ARM64ISD::LD4post"; + case ARM64ISD::ST2post: return "ARM64ISD::ST2post"; + case ARM64ISD::ST3post: return "ARM64ISD::ST3post"; + case ARM64ISD::ST4post: return "ARM64ISD::ST4post"; + case ARM64ISD::LD1x2post: return "ARM64ISD::LD1x2post"; + case ARM64ISD::LD1x3post: return "ARM64ISD::LD1x3post"; + case ARM64ISD::LD1x4post: return "ARM64ISD::LD1x4post"; + case ARM64ISD::ST1x2post: return "ARM64ISD::ST1x2post"; + case ARM64ISD::ST1x3post: return "ARM64ISD::ST1x3post"; + case ARM64ISD::ST1x4post: return "ARM64ISD::ST1x4post"; + case ARM64ISD::LD2DUPpost: return "ARM64ISD::LD2DUPpost"; + case ARM64ISD::LD3DUPpost: return "ARM64ISD::LD3DUPpost"; + case ARM64ISD::LD4DUPpost: return "ARM64ISD::LD4DUPpost"; + case ARM64ISD::LD2LANEpost: return "ARM64ISD::LD2LANEpost"; + case ARM64ISD::LD3LANEpost: return "ARM64ISD::LD3LANEpost"; + case ARM64ISD::LD4LANEpost: return "ARM64ISD::LD4LANEpost"; + case ARM64ISD::ST2LANEpost: return "ARM64ISD::ST2LANEpost"; + case ARM64ISD::ST3LANEpost: return "ARM64ISD::ST3LANEpost"; + case ARM64ISD::ST4LANEpost: return "ARM64ISD::ST4LANEpost"; } } @@ -5683,6 +5707,9 @@ bool ARM64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm64_neon_ld2: case Intrinsic::arm64_neon_ld3: case Intrinsic::arm64_neon_ld4: + case Intrinsic::arm64_neon_ld1x2: + case Intrinsic::arm64_neon_ld1x3: + case Intrinsic::arm64_neon_ld1x4: case Intrinsic::arm64_neon_ld2lane: case Intrinsic::arm64_neon_ld3lane: case Intrinsic::arm64_neon_ld4lane: @@ -5704,6 +5731,9 @@ bool ARM64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm64_neon_st2: case 
Intrinsic::arm64_neon_st3:
   case Intrinsic::arm64_neon_st4:
+  case Intrinsic::arm64_neon_st1x2:
+  case Intrinsic::arm64_neon_st1x3:
+  case Intrinsic::arm64_neon_st1x4:
   case Intrinsic::arm64_neon_st2lane:
   case Intrinsic::arm64_neon_st3lane:
   case Intrinsic::arm64_neon_st4lane: {
@@ -7038,6 +7068,152 @@ static SDValue performSTORECombine(SDNode *N,
                       S->getAlignment());
 }
 
+/// Target-specific DAG combine function for NEON load/store intrinsics
+/// to merge base address updates.
+///
+/// Looks for an ISD::ADD user of the address operand (the last operand of
+/// \p N) that is independent of \p N, and replaces both nodes with a single
+/// ARM64ISD::*post memory node that also yields the written-back address.
+/// A constant increment is folded only when it equals the number of bytes
+/// accessed; it is then passed as XZR instead of a constant operand.
+///
+/// \returns an empty SDValue in every case; replacements are made through
+/// \p DCI.
+static SDValue performNEONPostLDSTCombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  unsigned AddrOpIdx = N->getNumOperands() - 1;
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store. Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool IsStore = false;
+    bool IsLaneOp = false;
+    bool IsDupOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: llvm_unreachable("unexpected intrinsic for Neon base update");
+    case Intrinsic::arm64_neon_ld2:       NewOpc = ARM64ISD::LD2post;
+      NumVecs = 2; break;
+    case Intrinsic::arm64_neon_ld3:       NewOpc = ARM64ISD::LD3post;
+      NumVecs = 3; break;
+    case Intrinsic::arm64_neon_ld4:       NewOpc = ARM64ISD::LD4post;
+      NumVecs = 4; break;
+    case Intrinsic::arm64_neon_st2:       NewOpc = ARM64ISD::ST2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::arm64_neon_st3:       NewOpc = ARM64ISD::ST3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::arm64_neon_st4:       NewOpc = ARM64ISD::ST4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::arm64_neon_ld1x2:     NewOpc = ARM64ISD::LD1x2post;
+      NumVecs = 2; break;
+    case Intrinsic::arm64_neon_ld1x3:     NewOpc = ARM64ISD::LD1x3post;
+      NumVecs = 3; break;
+    case Intrinsic::arm64_neon_ld1x4:     NewOpc = ARM64ISD::LD1x4post;
+      NumVecs = 4; break;
+    case Intrinsic::arm64_neon_st1x2:     NewOpc = ARM64ISD::ST1x2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::arm64_neon_st1x3:     NewOpc = ARM64ISD::ST1x3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::arm64_neon_st1x4:     NewOpc = ARM64ISD::ST1x4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::arm64_neon_ld2r:      NewOpc = ARM64ISD::LD2DUPpost;
+      NumVecs = 2; IsDupOp = true; break;
+    case Intrinsic::arm64_neon_ld3r:      NewOpc = ARM64ISD::LD3DUPpost;
+      NumVecs = 3; IsDupOp = true; break;
+    case Intrinsic::arm64_neon_ld4r:      NewOpc = ARM64ISD::LD4DUPpost;
+      NumVecs = 4; IsDupOp = true; break;
+    case Intrinsic::arm64_neon_ld2lane:   NewOpc = ARM64ISD::LD2LANEpost;
+      NumVecs = 2; IsLaneOp = true; break;
+    case Intrinsic::arm64_neon_ld3lane:   NewOpc = ARM64ISD::LD3LANEpost;
+      NumVecs = 3; IsLaneOp = true; break;
+    case Intrinsic::arm64_neon_ld4lane:   NewOpc = ARM64ISD::LD4LANEpost;
+      NumVecs = 4; IsLaneOp = true; break;
+    case Intrinsic::arm64_neon_st2lane:   NewOpc = ARM64ISD::ST2LANEpost;
+      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
+    case Intrinsic::arm64_neon_st3lane:   NewOpc = ARM64ISD::ST3LANEpost;
+      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
+    case Intrinsic::arm64_neon_st4lane:   NewOpc = ARM64ISD::ST4LANEpost;
+      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
+    }
+
+    // Stores take the vector type from operand 2, loads from result 0.
+    EVT VecTy;
+    if (IsStore)
+      VecTy = N->getOperand(2).getValueType();
+    else
+      VecTy = N->getValueType(0);
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      // Compare at full width: narrowing a 64-bit increment to 32 bits could
+      // make a large constant falsely match the access size.
+      uint64_t IncVal = CInc->getZExtValue();
+      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+      // Lane and dup forms are sized per element rather than per whole vector.
+      if (IsLaneOp || IsDupOp)
+        NumBytes /= VecTy.getVectorNumElements();
+      if (IncVal != NumBytes)
+        continue;
+      Inc = DAG.getRegister(ARM64::XZR, MVT::i64);
+    }
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // Incoming chain
+    // Load lane and store have vector list as input.
+    if (IsLaneOp || IsStore)
+      for (unsigned i = 2; i < AddrOpIdx; ++i)
+        Ops.push_back(N->getOperand(i));
+    Ops.push_back(N->getOperand(AddrOpIdx)); // Base register
+    Ops.push_back(Inc);
+
+    // Return Types.
+    EVT Tys[6];
+    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i64;  // Type of write back register
+    Tys[n] = MVT::Other;  // Type of the chain
+    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
+                                           MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses: N is replaced by the vector results plus the chain, and
+    // the add by the write-back value.
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
 // Optimize compare with zero and branch.
 static SDValue performBRCONDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
@@ -7196,6 +7372,34 @@ SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N,
     return performSTORECombine(N, DCI, DAG, Subtarget);
   case ARM64ISD::BRCOND:
     return performBRCONDCombine(N, DCI, DAG);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::arm64_neon_ld2:
+    case Intrinsic::arm64_neon_ld3:
+    case Intrinsic::arm64_neon_ld4:
+    case Intrinsic::arm64_neon_ld1x2:
+    case Intrinsic::arm64_neon_ld1x3:
+    case Intrinsic::arm64_neon_ld1x4:
+    case Intrinsic::arm64_neon_ld2lane:
+    case Intrinsic::arm64_neon_ld3lane:
+    case Intrinsic::arm64_neon_ld4lane:
+    case Intrinsic::arm64_neon_ld2r:
+    case Intrinsic::arm64_neon_ld3r:
+    case Intrinsic::arm64_neon_ld4r:
+    case Intrinsic::arm64_neon_st2:
+    case Intrinsic::arm64_neon_st3:
+    case Intrinsic::arm64_neon_st4:
+    case Intrinsic::arm64_neon_st1x2:
+    case Intrinsic::arm64_neon_st1x3:
+    case Intrinsic::arm64_neon_st1x4:
+    case Intrinsic::arm64_neon_st2lane:
+    case Intrinsic::arm64_neon_st3lane:
+    case Intrinsic::arm64_neon_st4lane:
+      return performNEONPostLDSTCombine(N, DCI, DAG);
+    default:
+      break;
+    }
   }
   return SDValue();
 }
diff --git a/lib/Target/ARM64/ARM64ISelLowering.h b/lib/Target/ARM64/ARM64ISelLowering.h
index aa27b2d43d2..8b321ee9d03 100644
--- a/lib/Target/ARM64/ARM64ISelLowering.h
+++ b/lib/Target/ARM64/ARM64ISelLowering.h
@@ -160,7 +160,30 @@ enum {
 
   // {s|u}int to FP within a FP register.
SITOF, - UITOF + UITOF, + + // NEON Load/Store with post-increment base updates + LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, + LD3post, + LD4post, + ST2post, + ST3post, + ST4post, + LD1x2post, + LD1x3post, + LD1x4post, + ST1x2post, + ST1x3post, + ST1x4post, + LD2DUPpost, + LD3DUPpost, + LD4DUPpost, + LD2LANEpost, + LD3LANEpost, + LD4LANEpost, + ST2LANEpost, + ST3LANEpost, + ST4LANEpost }; } // end namespace ARM64ISD diff --git a/test/CodeGen/ARM64/indexed-vector-ldst.ll b/test/CodeGen/ARM64/indexed-vector-ldst.ll index c909a447e1e..4e951f9d2f7 100644 --- a/test/CodeGen/ARM64/indexed-vector-ldst.ll +++ b/test/CodeGen/ARM64/indexed-vector-ldst.ll @@ -611,3 +611,5080 @@ define float* @test_v2f32_post_reg_st1_lane(<2 x float> %in, float* %addr) { %newaddr = getelementptr float* %addr, i32 2 ret float* %newaddr } + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld2: +;CHECK: ld2.16b { v0, v1 }, [x0], #32 + %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 32 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8> } %ld2 +} + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld2: +;CHECK: ld2.16b { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8> } %ld2 +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld2: +;CHECK: ld2.8b { v0, v1 }, [x0], #16 + %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 16 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8> } %ld2 +} + +define { <8 x i8>, <8 x i8> } 
@test_v8i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld2: +;CHECK: ld2.8b { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8> } %ld2 +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld2: +;CHECK: ld2.8h { v0, v1 }, [x0], #32 + %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 16 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16> } %ld2 +} + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld2: +;CHECK: ld2.8h { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16> } %ld2 +} + +declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld2: +;CHECK: ld2.4h { v0, v1 }, [x0], #16 + %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 8 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16> } %ld2 +} + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld2: +;CHECK: ld2.4h { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16> } %ld2 +} + +declare { <4 x i16>, <4 x i16> } 
@llvm.arm64.neon.ld2.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld2: +;CHECK: ld2.4s { v0, v1 }, [x0], #32 + %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 8 + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32> } %ld2 +} + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld2: +;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32> } %ld2 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld2: +;CHECK: ld2.2s { v0, v1 }, [x0], #16 + %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 4 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32> } %ld2 +} + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld2: +;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32> } %ld2 +} + +declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld2: +;CHECK: ld2.2d { v0, v1 }, [x0], #32 + %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 4 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64> } 
%ld2 +} + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld2: +;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64> } %ld2 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld2: +;CHECK: ld1.1d { v0, v1 }, [x0], #16 + %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 2 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64> } %ld2 +} + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld2: +;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64> } %ld2 +} + +declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld2: +;CHECK: ld2.4s { v0, v1 }, [x0], #32 + %ld2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 8 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float> } %ld2 +} + +define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld2: +;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr 
+ ret { <4 x float>, <4 x float> } %ld2 +} + +declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld2: +;CHECK: ld2.2s { v0, v1 }, [x0], #16 + %ld2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 4 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float> } %ld2 +} + +define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld2: +;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float> } %ld2 +} + +declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld2: +;CHECK: ld2.2d { v0, v1 }, [x0], #32 + %ld2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 4 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double> } %ld2 +} + +define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld2: +;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double> } %ld2 +} + +declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2(double* %A, double** %ptr) { +;CHECK-LABEL: 
test_v1f64_post_imm_ld2: +;CHECK: ld1.1d { v0, v1 }, [x0], #16 + %ld2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 2 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double> } %ld2 +} + +define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld2: +;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double> } %ld2 +} + +declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double*) + + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld3: +;CHECK: ld3.16b { v0, v1, v2 }, [x0], #48 + %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 48 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld3: +;CHECK: ld3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld3: +;CHECK: ld3.8b { v0, v1, v2 }, [x0], #24 + %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 24 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, 
<8 x i8>, <8 x i8> } %ld3 +} + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld3: +;CHECK: ld3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld3: +;CHECK: ld3.8h { v0, v1, v2 }, [x0], #48 + %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 24 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld3: +;CHECK: ld3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld3: +;CHECK: ld3.4h { v0, v1, v2 }, [x0], #24 + %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 12 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld3: +;CHECK: ld3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail 
call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld3: +;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48 + %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 12 + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld3: +;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld3: +;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24 + %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 6 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld3: +;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +declare { 
<2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld3: +;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48 + %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 6 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld3: +;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 + %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 3 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3(float* %A, float** %ptr) { +;CHECK-LABEL: 
test_v4f32_post_imm_ld3: +;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48 + %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 12 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld3: +;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld3: +;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24 + %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 6 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld3: +;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld3: +;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48 + %ld3 = 
tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 6 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld3: +;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 + %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 3 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double*) + + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld4: +;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], #64 + %ld4 = tail call { 
<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 64 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld4: +;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld4: +;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], #32 + %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 32 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld4: +;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld4: +;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], #64 + %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A) + %tmp = getelementptr 
i16* %A, i32 32 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld4: +;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld4: +;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], #32 + %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 16 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld4: +;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld4: +;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64 + %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 16 + store i32* %tmp, i32** %ptr 
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld4: +;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld4: +;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32 + %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 8 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld4: +;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld4: +;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64 + %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 8 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x 
i64> } %ld4 +} + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld4: +;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 + %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 4 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld4: +;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64 + %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 16 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } 
%ld4 +} + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld4: +;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld4: +;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32 + %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 8 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld4: +;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld4: +;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64 + %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double* %A) + %tmp 
= getelementptr double* %A, i32 8 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld4: +;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 + %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 4 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double*) + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x2(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld1x2: +;CHECK: ld1.16b 
{ v0, v1 }, [x0], #32 + %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 32 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8> } %ld1x2 +} + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld1x2: +;CHECK: ld1.16b { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8> } %ld1x2 +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x2(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld1x2: +;CHECK: ld1.8b { v0, v1 }, [x0], #16 + %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 16 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8> } %ld1x2 +} + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld1x2: +;CHECK: ld1.8b { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8> } %ld1x2 +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x2(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld1x2: +;CHECK: ld1.8h { v0, v1 }, [x0], #32 + %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 16 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16> } %ld1x2 +} + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld1x2: +;CHECK: ld1.8h { v0, v1 }, 
[x0], x{{[0-9]+}} + %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16> } %ld1x2 +} + +declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x2(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld1x2: +;CHECK: ld1.4h { v0, v1 }, [x0], #16 + %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 8 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16> } %ld1x2 +} + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld1x2: +;CHECK: ld1.4h { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16> } %ld1x2 +} + +declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x2(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld1x2: +;CHECK: ld1.4s { v0, v1 }, [x0], #32 + %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 8 + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32> } %ld1x2 +} + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld1x2: +;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32> } %ld1x2 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32> } 
@test_v2i32_post_imm_ld1x2(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld1x2: +;CHECK: ld1.2s { v0, v1 }, [x0], #16 + %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 4 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32> } %ld1x2 +} + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld1x2: +;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32> } %ld1x2 +} + +declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x2(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld1x2: +;CHECK: ld1.2d { v0, v1 }, [x0], #32 + %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 4 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64> } %ld1x2 +} + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld1x2: +;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64> } %ld1x2 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x2(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld1x2: +;CHECK: ld1.1d { v0, v1 }, [x0], #16 + %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 2 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64> } %ld1x2 +} + +define { <1 
x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld1x2: +;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64> } %ld1x2 +} + +declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x2(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld1x2: +;CHECK: ld1.4s { v0, v1 }, [x0], #32 + %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 8 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float> } %ld1x2 +} + +define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld1x2: +;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float> } %ld1x2 +} + +declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x2(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld1x2: +;CHECK: ld1.2s { v0, v1 }, [x0], #16 + %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 4 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float> } %ld1x2 +} + +define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld1x2: +;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <2 x float>, <2 x float> } 
@llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float> } %ld1x2 +} + +declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x2(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld1x2: +;CHECK: ld1.2d { v0, v1 }, [x0], #32 + %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 4 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double> } %ld1x2 +} + +define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld1x2: +;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double> } %ld1x2 +} + +declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x2(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld1x2: +;CHECK: ld1.1d { v0, v1 }, [x0], #16 + %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 2 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double> } %ld1x2 +} + +define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld1x2: +;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} + %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double> } %ld1x2 +} + +declare { <1 
x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double*) + + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x3(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld1x3: +;CHECK: ld1.16b { v0, v1, v2 }, [x0], #48 + %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 48 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3 +} + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld1x3: +;CHECK: ld1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x3(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld1x3: +;CHECK: ld1.8b { v0, v1, v2 }, [x0], #24 + %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 24 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3 +} + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld1x3: +;CHECK: ld1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x3(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld1x3: +;CHECK: ld1.8h { 
v0, v1, v2 }, [x0], #48 + %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 24 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3 +} + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld1x3: +;CHECK: ld1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x3(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld1x3: +;CHECK: ld1.4h { v0, v1, v2 }, [x0], #24 + %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 12 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3 +} + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld1x3: +;CHECK: ld1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x3(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld1x3: +;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48 + %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 12 + store 
i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3 +} + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld1x3: +;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x3(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld1x3: +;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24 + %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 6 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3 +} + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld1x3: +;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x3(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld1x3: +;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48 + %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 6 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3 +} + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 
%inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld1x3: +;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x3(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld1x3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 + %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 3 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3 +} + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld1x3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x3(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld1x3: +;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48 + %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 12 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3 +} + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld1x3: +;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <4 x 
float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3 +} + +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x3(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld1x3: +;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24 + %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 6 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3 +} + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld1x3: +;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3 +} + +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x3(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld1x3: +;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48 + %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 6 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3 +} + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld1x3: +;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <2 x double>, <2 x 
double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3 +} + +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x3(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld1x3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 + %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 3 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3 +} + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld1x3: +;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3 +} + +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double*) + + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x4(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v16i8_post_imm_ld1x4: +;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], #64 + %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 64 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4 +} + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v16i8_post_reg_ld1x4: +;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { 
<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8*) + + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x4(i8* %A, i8** %ptr) { +;CHECK-LABEL: test_v8i8_post_imm_ld1x4: +;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], #32 + %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 32 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4 +} + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i8_post_reg_ld1x4: +;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8*) + + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x4(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v8i16_post_imm_ld1x4: +;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], #64 + %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 32 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4 +} + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v8i16_post_reg_ld1x4: +;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } 
@llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16*) + + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x4(i16* %A, i16** %ptr) { +;CHECK-LABEL: test_v4i16_post_imm_ld1x4: +;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], #32 + %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 16 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4 +} + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i16_post_reg_ld1x4: +;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16*) + + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x4(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v4i32_post_imm_ld1x4: +;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64 + %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 16 + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4 +} + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4i32_post_reg_ld1x4: +;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } 
@llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32*) + + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x4(i32* %A, i32** %ptr) { +;CHECK-LABEL: test_v2i32_post_imm_ld1x4: +;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32 + %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 8 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4 +} + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i32_post_reg_ld1x4: +;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32*) + + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x4(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v2i64_post_imm_ld1x4: +;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64 + %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 8 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4 +} + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2i64_post_reg_ld1x4: +;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } 
@llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64*) + + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x4(i64* %A, i64** %ptr) { +;CHECK-LABEL: test_v1i64_post_imm_ld1x4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 + %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 4 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4 +} + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1i64_post_reg_ld1x4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64*) + + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x4(float* %A, float** %ptr) { +;CHECK-LABEL: test_v4f32_post_imm_ld1x4: +;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64 + %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 16 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4 +} + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v4f32_post_reg_ld1x4: +;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { 
<4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4 +} + +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float*) + + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x4(float* %A, float** %ptr) { +;CHECK-LABEL: test_v2f32_post_imm_ld1x4: +;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32 + %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 8 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4 +} + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f32_post_reg_ld1x4: +;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4 +} + +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float*) + + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x4(double* %A, double** %ptr) { +;CHECK-LABEL: test_v2f64_post_imm_ld1x4: +;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64 + %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 8 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4 +} + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } 
@test_v2f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v2f64_post_reg_ld1x4: +;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4 +} + +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double*) + + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x4(double* %A, double** %ptr) { +;CHECK-LABEL: test_v1f64_post_imm_ld1x4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 + %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 4 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4 +} + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) { +;CHECK-LABEL: test_v1f64_post_reg_ld1x4: +;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4 +} + +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double*) + + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld2r: +;CHECK: ld2r.16b { v0, v1 }, [x0], #2 + %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 2 + store i8* %tmp, i8** %ptr + 
ret { <16 x i8>, <16 x i8> } %ld2 +} + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld2r: +;CHECK: ld2r.16b { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8> } %ld2 +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld2r: +;CHECK: ld2r.8b { v0, v1 }, [x0], #2 + %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 2 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8> } %ld2 +} + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld2r: +;CHECK: ld2r.8b { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8> } %ld2 +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld2r: +;CHECK: ld2r.8h { v0, v1 }, [x0], #4 + %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 2 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16> } %ld2 +} + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld2r: +;CHECK: ld2r.8h { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* 
%tmp, i16** %ptr + ret { <8 x i16>, <8 x i16> } %ld2 +} + +declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld2r: +;CHECK: ld2r.4h { v0, v1 }, [x0], #4 + %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 2 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16> } %ld2 +} + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld2r: +;CHECK: ld2r.4h { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16> } %ld2 +} + +declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld2r: +;CHECK: ld2r.4s { v0, v1 }, [x0], #8 + %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 2 + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32> } %ld2 +} + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld2r: +;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32> } %ld2 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld2r: +;CHECK: ld2r.2s { v0, v1 }, [x0], 
#8 + %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 2 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32> } %ld2 +} + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld2r: +;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32> } %ld2 +} + +declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld2r: +;CHECK: ld2r.2d { v0, v1 }, [x0], #16 + %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 2 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64> } %ld2 +} + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld2r: +;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64> } %ld2 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld2r: +;CHECK: ld2r.1d { v0, v1 }, [x0], #16 + %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 2 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64> } %ld2 +} + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: 
test_v1i64_post_reg_ld2r: +;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64> } %ld2 +} + +declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly + + +define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld2r: +;CHECK: ld2r.4s { v0, v1 }, [x0], #8 + %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 2 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float> } %ld2 +} + +define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld2r: +;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float> } %ld2 +} + +declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float*) nounwind readonly + +define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld2r: +;CHECK: ld2r.2s { v0, v1 }, [x0], #8 + %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 2 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float> } %ld2 +} + +define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld2r: +;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** 
%ptr + ret { <2 x float>, <2 x float> } %ld2 +} + +declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float*) nounwind readonly + + +define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld2r: +;CHECK: ld2r.2d { v0, v1 }, [x0], #16 + %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 2 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double> } %ld2 +} + +define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld2r: +;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double> } %ld2 +} + +declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double*) nounwind readonly + +define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld2r: +;CHECK: ld2r.1d { v0, v1 }, [x0], #16 + %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 2 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double> } %ld2 +} + +define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld2r: +;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}} + %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double> } %ld2 +} + +declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double*) nounwind readonly + + 
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld3r: +;CHECK: ld3r.16b { v0, v1, v2 }, [x0], #3 + %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 3 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld3r: +;CHECK: ld3r.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld3r: +;CHECK: ld3r.8b { v0, v1, v2 }, [x0], #3 + %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 3 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 +} + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld3r: +;CHECK: ld3r.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld3r: +;CHECK: ld3r.8h { v0, v1, v2 }, [x0], #6 + %ld3 = call { <8 x 
i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 3 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld3r: +;CHECK: ld3r.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld3r: +;CHECK: ld3r.4h { v0, v1, v2 }, [x0], #6 + %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 3 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld3r: +;CHECK: ld3r.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld3r: +;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12 + %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 3 + store i32* %tmp, i32** %ptr + ret { 
<4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld3r: +;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld3r: +;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12 + %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 3 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld3r: +;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld3r: +;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24 + %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 3 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) 
nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld3r: +;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld3r: +;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24 + %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 3 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld3r: +;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly + + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld3r: +;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12 + %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 3 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld3r: +;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 
= call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float*) nounwind readonly + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld3r: +;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12 + %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 3 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld3r: +;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float*) nounwind readonly + + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld3r: +;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24 + %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 3 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld3r: +;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} 
+ %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double*) nounwind readonly + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld3r: +;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24 + %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 3 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld3r: +;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} + %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double*) nounwind readonly + + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld4r: +;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], #4 + %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 4 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld4r: +;CHECK: ld4r.16b 
{ v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld4r: +;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], #4 + %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i32 4 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld4r: +;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld4r: +;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], #8 + %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 4 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld4r: +;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} 
+ %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld4r: +;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], #8 + %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i32 4 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld4r: +;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld4r: +;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16 + %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 4 + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld4r: +;CHECK: ld4r.4s { v0, v1, v2, v3 
}, [x0], x{{[0-9]+}} + %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld4r: +;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16 + %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i32 4 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld4r: +;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld4r: +;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32 + %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 4 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld4r: +;CHECK: 
ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld4r: +;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32 + %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i32 4 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld4r: +;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly + + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld4r: +;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16 + %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 4 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4r(float* %A, float** %ptr, 
i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld4r: +;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float*) nounwind readonly + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4r(float* %A, float** %ptr) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld4r: +;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16 + %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i32 4 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld4r: +;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float*) nounwind readonly + + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld4r: +;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32 + %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 4 + store double* %tmp, 
double** %ptr + ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld4r: +;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double*) nounwind readonly + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4r(double* %A, double** %ptr) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld4r: +;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32 + %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i32 4 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld4r: +;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double*) nounwind readonly + + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { +;CHECK-LABEL: 
test_v16i8_post_imm_ld2lane: +;CHECK: ld2.b { v0, v1 }[0], [x0], #2 + %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 2 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8> } %ld2 +} + +define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld2lane: +;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8> } %ld2 +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld2lane: +;CHECK: ld2.b { v0, v1 }[0], [x0], #2 + %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 2 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8> } %ld2 +} + +define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld2lane: +;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8> } %ld2 +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld2lane: 
+;CHECK: ld2.h { v0, v1 }[0], [x0], #4 + %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 2 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16> } %ld2 +} + +define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld2lane: +;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16> } %ld2 +} + +declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld2lane: +;CHECK: ld2.h { v0, v1 }[0], [x0], #4 + %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 2 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16> } %ld2 +} + +define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld2lane: +;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16> } %ld2 +} + +declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { 
+;CHECK-LABEL: test_v4i32_post_imm_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], #8 + %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 2 + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32> } %ld2 +} + +define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32> } %ld2 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], #8 + %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 2 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32> } %ld2 +} + +define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32> } %ld2 +} + +declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2lane(i64* %A, i64** %ptr, <2 x 
i64> %B, <2 x i64> %C) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], #16 + %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i32 2 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64> } %ld2 +} + +define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64> } %ld2 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly + + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], #16 + %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i32 2 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64> } %ld2 +} + +define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64> } %ld2 +} + +declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) nounwind readonly + + +define { <4 x float>, <4 x float> } 
@test_v4f32_post_imm_ld2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], #8 + %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 2 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float> } %ld2 +} + +define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float> } %ld2 +} + +declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) nounwind readonly + + +define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], #8 + %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 2 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float> } %ld2 +} + +define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld2lane: +;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float> } 
%ld2 +} + +declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) nounwind readonly + + +define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], #16 + %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) + %tmp = getelementptr double* %A, i32 2 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double> } %ld2 +} + +define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} + %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double> } %ld2 +} + +declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) nounwind readonly + + +define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], #16 + %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) + %tmp = getelementptr double* %A, i32 2 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double> } %ld2 +} + +define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld2lane: +;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} 
+ %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double> } %ld2 +} + +declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) nounwind readonly + + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld3lane: +;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3 + %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 3 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld3lane: +;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 +} + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld3lane: +;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3 + %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 3 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, 
<8 x i8>, <8 x i8> } %ld3 +} + +define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld3lane: +;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld3lane: +;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6 + %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 3 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld3lane: +;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { +;CHECK-LABEL: 
test_v4i16_post_imm_ld3lane: +;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6 + %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 3 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld3lane: +;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 + %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 3 + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 +} + +define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x 
i32>, <4 x i32> } %ld3 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 + %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 3 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 + %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i32 3 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind 
{ +;CHECK-LABEL: test_v2i64_post_reg_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly + + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 + %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i32 3 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly + + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 + %ld3 = call { <4 x float>, <4 x float>, <4 x float> } 
@llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 3 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float> } %ld3 +} + +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly + + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 + %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 3 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld3lane: +;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, 
float** %ptr + ret { <2 x float>, <2 x float>, <2 x float> } %ld3 +} + +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly + + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 + %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) + %tmp = getelementptr double* %A, i32 3 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double> } %ld3 +} + +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly + + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 + %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) + %tmp = getelementptr 
double* %A, i32 3 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld3lane: +;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double> } %ld3 +} + +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly + + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_ld4lane: +;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4 + %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 4 + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 +} + +define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_ld4lane: +;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 
+} + +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_ld4lane: +;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4 + %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 4 + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_ld4lane: +;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + store i8* %tmp, i8** %ptr + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 +} + +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly + + +define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_ld4lane: +;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8 + %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 4 + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +define { <8 
x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_ld4lane: +;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 +} + +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_ld4lane: +;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8 + %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 4 + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_ld4lane: +;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + store i16* %tmp, i16** %ptr + ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 +} + +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16>, <4 
x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly + + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 + %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 4 + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 +} + +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 + %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 4 + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x 
i32> } @test_v2i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + store i32* %tmp, i32** %ptr + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 +} + +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly + + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 + %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i32 4 + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 +} + +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, 
i64, i64*) nounwind readonly + + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 + %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i32 4 + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + store i64* %tmp, i64** %ptr + ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 +} + +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly + + +define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 + %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 4 + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +define { <4 x float>, <4 x 
float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 +} + +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly + + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 + %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 4 + store float* %tmp, float** %ptr + ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_ld4lane: +;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + store float* %tmp, float** %ptr + ret { <2 x 
float>, <2 x float>, <2 x float>, <2 x float> } %ld4 +} + +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly + + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 + %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) + %tmp = getelementptr double* %A, i32 4 + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 +} + +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly + + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_ld4lane: 
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 + %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) + %tmp = getelementptr double* %A, i32 4 + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_ld4lane: +;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + store double* %tmp, double** %ptr + ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 +} + +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly + + +define i8* @test_v16i8_post_imm_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st2: +;CHECK: st2.16b { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) + %tmp = getelementptr i8* %A, i32 32 + ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st2: +;CHECK: st2.16b { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) + + +define i8* 
@test_v8i8_post_imm_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st2: +;CHECK: st2.8b { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) + %tmp = getelementptr i8* %A, i32 16 + ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st2: +;CHECK: st2.8b { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) + + +define i16* @test_v8i16_post_imm_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st2: +;CHECK: st2.8h { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) + %tmp = getelementptr i16* %A, i32 16 + ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st2: +;CHECK: st2.8h { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st2: +;CHECK: st2.4h { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) + %tmp = getelementptr i16* %A, i32 8 + ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st2: +;CHECK: st2.4h { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x 
i16> %C, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) + + +define i32* @test_v4i32_post_imm_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st2: +;CHECK: st2.4s { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) + %tmp = getelementptr i32* %A, i32 8 + ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st2: +;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) + + +define i32* @test_v2i32_post_imm_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st2: +;CHECK: st2.2s { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) + %tmp = getelementptr i32* %A, i32 4 + ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st2: +;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st2: +;CHECK: st2.2d { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) + %tmp = getelementptr i64* %A, i64 4 + ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st2(i64* %A, i64** 
%ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st2: +;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) + + +define i64* @test_v1i64_post_imm_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st2: +;CHECK: st1.1d { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) + %tmp = getelementptr i64* %A, i64 2 + ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st2: +;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) + + +define float* @test_v4f32_post_imm_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st2: +;CHECK: st2.4s { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) + %tmp = getelementptr float* %A, i32 8 + ret float* %tmp +} + +define float* @test_v4f32_post_reg_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st2: +;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { 
+;CHECK-LABEL: test_v2f32_post_imm_st2: +;CHECK: st2.2s { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) + %tmp = getelementptr float* %A, i32 4 + ret float* %tmp +} + +define float* @test_v2f32_post_reg_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st2: +;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st2: +;CHECK: st2.2d { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) + %tmp = getelementptr double* %A, i64 4 + ret double* %tmp +} + +define double* @test_v2f64_post_reg_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st2: +;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double>, <2 x double>, double*) + + +define double* @test_v1f64_post_imm_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st2: +;CHECK: st1.1d { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) + %tmp = getelementptr double* %A, i64 2 + ret double* %tmp +} + +define double* @test_v1f64_post_reg_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st2: +;CHECK: 
st1.1d { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st3: +;CHECK: st3.16b { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) + %tmp = getelementptr i8* %A, i32 48 + ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st3: +;CHECK: st3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st3: +;CHECK: st3.8b { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) + %tmp = getelementptr i8* %A, i32 24 + ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st3: +;CHECK: st3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) + + +define i16* @test_v8i16_post_imm_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { +;CHECK-LABEL: 
test_v8i16_post_imm_st3: +;CHECK: st3.8h { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) + %tmp = getelementptr i16* %A, i32 24 + ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st3: +;CHECK: st3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st3: +;CHECK: st3.4h { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) + %tmp = getelementptr i16* %A, i32 12 + ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st3: +;CHECK: st3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) + + +define i32* @test_v4i32_post_imm_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st3: +;CHECK: st3.4s { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) + %tmp = getelementptr i32* %A, i32 12 + ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: 
test_v4i32_post_reg_st3: +;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) + + +define i32* @test_v2i32_post_imm_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st3: +;CHECK: st3.2s { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) + %tmp = getelementptr i32* %A, i32 6 + ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st3: +;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st3: +;CHECK: st3.2d { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) + %tmp = getelementptr i64* %A, i64 6 + ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st3: +;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) + + +define i64* @test_v1i64_post_imm_st3(i64* %A, i64** %ptr, 
<1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) + %tmp = getelementptr i64* %A, i64 3 + ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) + + +define float* @test_v4f32_post_imm_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st3: +;CHECK: st3.4s { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) + %tmp = getelementptr float* %A, i32 12 + ret float* %tmp +} + +define float* @test_v4f32_post_reg_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st3: +;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st3: +;CHECK: st3.2s { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) + %tmp = getelementptr float* %A, i32 6 + ret float* %tmp +} + +define 
float* @test_v2f32_post_reg_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st3: +;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st3: +;CHECK: st3.2d { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) + %tmp = getelementptr double* %A, i64 6 + ret double* %tmp +} + +define double* @test_v2f64_post_reg_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st3: +;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) + + +define double* @test_v1f64_post_imm_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) + %tmp = getelementptr double* %A, i64 3 + ret double* %tmp +} + +define double* @test_v1f64_post_reg_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} + call 
void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st4: +;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) + %tmp = getelementptr i8* %A, i32 64 + ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st4: +;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st4: +;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) + %tmp = getelementptr i8* %A, i32 32 + ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st4: +;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) + + +define i16* 
@test_v8i16_post_imm_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st4: +;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) + %tmp = getelementptr i16* %A, i32 32 + ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st4: +;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st4: +;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) + %tmp = getelementptr i16* %A, i32 16 + ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st4: +;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>,<4 x i16>, i16*) + + +define i32* @test_v4i32_post_imm_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st4: +;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64 + call void 
@llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) + %tmp = getelementptr i32* %A, i32 16 + ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st4: +;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>,<4 x i32>, i32*) + + +define i32* @test_v2i32_post_imm_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st4: +;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) + %tmp = getelementptr i32* %A, i32 8 + ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st4: +;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st4: +;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) + %tmp = getelementptr i64* %A, i64 8 + ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st4(i64* %A, i64** %ptr, <2 x i64> %B, 
<2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st4: +;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>,<2 x i64>, i64*) + + +define i64* @test_v1i64_post_imm_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) + %tmp = getelementptr i64* %A, i64 4 + ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>,<1 x i64>, i64*) + + +define float* @test_v4f32_post_imm_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st4: +;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) + %tmp = getelementptr float* %A, i32 16 + ret float* %tmp +} + +define float* @test_v4f32_post_reg_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st4: +;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void 
@llvm.arm64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st4: +;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) + %tmp = getelementptr float* %A, i32 8 + ret float* %tmp +} + +define float* @test_v2f32_post_reg_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st4: +;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st4: +;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) + %tmp = getelementptr double* %A, i64 8 + ret double* %tmp +} + +define double* @test_v2f64_post_reg_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st4: +;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x 
double> %D, <2 x double> %E, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>,<2 x double>, double*) + + +define double* @test_v1f64_post_imm_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) + %tmp = getelementptr double* %A, i64 4 + ret double* %tmp +} + +define double* @test_v1f64_post_reg_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st1x2: +;CHECK: st1.16b { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) + %tmp = getelementptr i8* %A, i32 32 + ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st1x2: +;CHECK: st1.16b { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st1x2(i8* %A, i8** %ptr, 
<8 x i8> %B, <8 x i8> %C) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st1x2: +;CHECK: st1.8b { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) + %tmp = getelementptr i8* %A, i32 16 + ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st1x2: +;CHECK: st1.8b { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) + + +define i16* @test_v8i16_post_imm_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st1x2: +;CHECK: st1.8h { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) + %tmp = getelementptr i16* %A, i32 16 + ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st1x2: +;CHECK: st1.8h { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st1x2: +;CHECK: st1.4h { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) + %tmp = getelementptr i16* %A, i32 8 + ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st1x2: +;CHECK: st1.4h { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> 
%C, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) + + +define i32* @test_v4i32_post_imm_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st1x2: +;CHECK: st1.4s { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) + %tmp = getelementptr i32* %A, i32 8 + ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st1x2: +;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) + + +define i32* @test_v2i32_post_imm_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st1x2: +;CHECK: st1.2s { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) + %tmp = getelementptr i32* %A, i32 4 + ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st1x2: +;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st1x2: +;CHECK: st1.2d { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) + %tmp = getelementptr i64* %A, i64 4 + ret i64* %tmp +} + +define i64* 
@test_v2i64_post_reg_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st1x2: +;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) + + +define i64* @test_v1i64_post_imm_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st1x2: +;CHECK: st1.1d { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) + %tmp = getelementptr i64* %A, i64 2 + ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st1x2: +;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) + + +define float* @test_v4f32_post_imm_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st1x2: +;CHECK: st1.4s { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) + %tmp = getelementptr float* %A, i32 8 + ret float* %tmp +} + +define float* @test_v4f32_post_reg_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st1x2: +;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) + + +define float* 
@test_v2f32_post_imm_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st1x2: +;CHECK: st1.2s { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) + %tmp = getelementptr float* %A, i32 4 + ret float* %tmp +} + +define float* @test_v2f32_post_reg_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st1x2: +;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st1x2: +;CHECK: st1.2d { v0, v1 }, [x0], #32 + call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) + %tmp = getelementptr double* %A, i64 4 + ret double* %tmp +} + +define double* @test_v2f64_post_reg_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st1x2: +;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) + + +define double* @test_v1f64_post_imm_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st1x2: +;CHECK: st1.1d { v0, v1 }, [x0], #16 + call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) + %tmp = getelementptr double* %A, i64 2 + ret double* %tmp +} + +define double* @test_v1f64_post_reg_st1x2(double* 
%A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st1x2: +;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st1x3: +;CHECK: st1.16b { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) + %tmp = getelementptr i8* %A, i32 48 + ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st1x3: +;CHECK: st1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st1x3: +;CHECK: st1.8b { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) + %tmp = getelementptr i8* %A, i32 24 + ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st1x3: +;CHECK: st1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, 
i8*) + + +define i16* @test_v8i16_post_imm_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st1x3: +;CHECK: st1.8h { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) + %tmp = getelementptr i16* %A, i32 24 + ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st1x3: +;CHECK: st1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st1x3: +;CHECK: st1.4h { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) + %tmp = getelementptr i16* %A, i32 12 + ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st1x3: +;CHECK: st1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) + + +define i32* @test_v4i32_post_imm_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st1x3: +;CHECK: st1.4s { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) + %tmp = getelementptr i32* %A, i32 12 + ret 
i32* %tmp +} + +define i32* @test_v4i32_post_reg_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st1x3: +;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) + + +define i32* @test_v2i32_post_imm_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st1x3: +;CHECK: st1.2s { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) + %tmp = getelementptr i32* %A, i32 6 + ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st1x3: +;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st1x3: +;CHECK: st1.2d { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) + %tmp = getelementptr i64* %A, i64 6 + ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st1x3: +;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) + %tmp = getelementptr 
i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) + + +define i64* @test_v1i64_post_imm_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st1x3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) + %tmp = getelementptr i64* %A, i64 3 + ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st1x3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) + + +define float* @test_v4f32_post_imm_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st1x3: +;CHECK: st1.4s { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) + %tmp = getelementptr float* %A, i32 12 + ret float* %tmp +} + +define float* @test_v4f32_post_reg_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st1x3: +;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { +;CHECK-LABEL: 
test_v2f32_post_imm_st1x3: +;CHECK: st1.2s { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) + %tmp = getelementptr float* %A, i32 6 + ret float* %tmp +} + +define float* @test_v2f32_post_reg_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st1x3: +;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st1x3: +;CHECK: st1.2d { v0, v1, v2 }, [x0], #48 + call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) + %tmp = getelementptr double* %A, i64 6 + ret double* %tmp +} + +define double* @test_v2f64_post_reg_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st1x3: +;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) + + +define double* @test_v1f64_post_imm_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st1x3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 + call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) + %tmp = getelementptr 
double* %A, i64 3 + ret double* %tmp +} + +define double* @test_v1f64_post_reg_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st1x3: +;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st1x4: +;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) + %tmp = getelementptr i8* %A, i32 64 + ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st1x4: +;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) + + +define i8* @test_v8i8_post_imm_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st1x4: +;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) + %tmp = getelementptr i8* %A, i32 32 + ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st1x4: +;CHECK: st1.8b 
{ v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) + + +define i16* @test_v8i16_post_imm_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st1x4: +;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) + %tmp = getelementptr i16* %A, i32 32 + ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st1x4: +;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) + + +define i16* @test_v4i16_post_imm_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st1x4: +;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) + %tmp = getelementptr i16* %A, i32 16 + ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st1x4: +;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + 
+declare void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>,<4 x i16>, i16*) + + +define i32* @test_v4i32_post_imm_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st1x4: +;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) + %tmp = getelementptr i32* %A, i32 16 + ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st1x4: +;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>,<4 x i32>, i32*) + + +define i32* @test_v2i32_post_imm_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st1x4: +;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) + %tmp = getelementptr i32* %A, i32 8 + ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st1x4: +;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) + + +define i64* @test_v2i64_post_imm_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> 
%D, <2 x i64> %E) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st1x4: +;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) + %tmp = getelementptr i64* %A, i64 8 + ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st1x4: +;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>,<2 x i64>, i64*) + + +define i64* @test_v1i64_post_imm_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st1x4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) + %tmp = getelementptr i64* %A, i64 4 + ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st1x4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>,<1 x i64>, i64*) + + +define float* @test_v4f32_post_imm_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st1x4: +;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> 
%C, <4 x float> %D, <4 x float> %E, float* %A) + %tmp = getelementptr float* %A, i32 16 + ret float* %tmp +} + +define float* @test_v4f32_post_reg_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st1x4: +;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) + + +define float* @test_v2f32_post_imm_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st1x4: +;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) + %tmp = getelementptr float* %A, i32 8 + ret float* %tmp +} + +define float* @test_v2f32_post_reg_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st1x4: +;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) + + +define double* @test_v2f64_post_imm_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st1x4: +;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64 + call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) + %tmp = 
getelementptr double* %A, i64 8 + ret double* %tmp +} + +define double* @test_v2f64_post_reg_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st1x4: +;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>,<2 x double>, double*) + + +define double* @test_v1f64_post_imm_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st1x4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 + call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) + %tmp = getelementptr double* %A, i64 4 + ret double* %tmp +} + +define double* @test_v1f64_post_reg_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st1x4: +;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) + + +define i8* @test_v16i8_post_imm_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) { + call void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A) + %tmp = getelementptr i8* %A, i32 2 + ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) { + call void 
@llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i64, i8*) nounwind readnone + + +define i8* @test_v16i8_post_imm_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st2lane: +;CHECK: st2.b { v0, v1 }[0], [x0], #2 + call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 2 + ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st2lane: +;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) + + +define i8* @test_v8i8_post_imm_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st2lane: +;CHECK: st2.b { v0, v1 }[0], [x0], #2 + call void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 2 + ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st2lane: +;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) + + +define i16* @test_v8i16_post_imm_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st2lane: +;CHECK: st2.h { v0, v1 }[0], [x0], #4 + call void 
@llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 2 + ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st2lane: +;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) + + +define i16* @test_v4i16_post_imm_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st2lane: +;CHECK: st2.h { v0, v1 }[0], [x0], #4 + call void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 2 + ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st2lane: +;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) + + +define i32* @test_v4i32_post_imm_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], #8 + call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 2 + ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, 
i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) + + +define i32* @test_v2i32_post_imm_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], #8 + call void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 2 + ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) + + +define i64* @test_v2i64_post_imm_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], #16 + call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 2 + ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) + + +define i64* @test_v1i64_post_imm_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], #16 + call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, 
i64* %A) + %tmp = getelementptr i64* %A, i64 2 + ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) + + +define float* @test_v4f32_post_imm_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], #8 + call void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 2 + ret float* %tmp +} + +define float* @test_v4f32_post_reg_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) + + +define float* @test_v2f32_post_imm_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], #8 + call void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 2 + ret float* %tmp +} + +define float* @test_v2f32_post_reg_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st2lane: +;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, 
i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) + + +define double* @test_v2f64_post_imm_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], #16 + call void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 2 + ret double* %tmp +} + +define double* @test_v2f64_post_reg_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) + + +define double* @test_v1f64_post_imm_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], #16 + call void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 2 + ret double* %tmp +} + +define double* @test_v1f64_post_reg_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st2lane: +;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) + + +define i8* @test_v16i8_post_imm_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { 
+;CHECK-LABEL: test_v16i8_post_imm_st3lane: +;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3 + call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 3 + ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st3lane: +;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) + + +define i8* @test_v8i8_post_imm_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st3lane: +;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3 + call void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 3 + ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st3lane: +;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) + + +define i16* @test_v8i16_post_imm_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st3lane: +;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6 + call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 3 + ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 
x i16> %C, <8 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st3lane: +;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) + + +define i16* @test_v4i16_post_imm_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st3lane: +;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6 + call void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 3 + ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st3lane: +;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) + + +define i32* @test_v4i32_post_imm_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 + call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 3 + ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) + %tmp = getelementptr i32* %A, 
i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) + + +define i32* @test_v2i32_post_imm_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 + call void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 3 + ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) + + +define i64* @test_v2i64_post_imm_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 + call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 3 + ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) + + +define i64* @test_v1i64_post_imm_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { +;CHECK-LABEL: 
test_v1i64_post_imm_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 + call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 3 + ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) + + +define float* @test_v4f32_post_imm_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 + call void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 3 + ret float* %tmp +} + +define float* @test_v4f32_post_reg_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) + + +define float* @test_v2f32_post_imm_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 + call void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) + %tmp = getelementptr 
float* %A, i32 3 + ret float* %tmp +} + +define float* @test_v2f32_post_reg_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st3lane: +;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) + + +define double* @test_v2f64_post_imm_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 + call void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 3 + ret double* %tmp +} + +define double* @test_v2f64_post_reg_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) + + +define double* @test_v1f64_post_imm_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 + call void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 3 + ret double* %tmp +} + +define double* @test_v1f64_post_reg_st3lane(double* %A, double** %ptr, <1 x double> 
%B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st3lane: +;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) + + +define i8* @test_v16i8_post_imm_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { +;CHECK-LABEL: test_v16i8_post_imm_st4lane: +;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4 + call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 4 + ret i8* %tmp +} + +define i8* @test_v16i8_post_reg_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v16i8_post_reg_st4lane: +;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) + + +define i8* @test_v8i8_post_imm_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { +;CHECK-LABEL: test_v8i8_post_imm_st4lane: +;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4 + call void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i32 4 + ret i8* %tmp +} + +define i8* @test_v8i8_post_reg_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i8_post_reg_st4lane: +;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void 
@llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) + %tmp = getelementptr i8* %A, i64 %inc + ret i8* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) + + +define i16* @test_v8i16_post_imm_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { +;CHECK-LABEL: test_v8i16_post_imm_st4lane: +;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8 + call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 4 + ret i16* %tmp +} + +define i16* @test_v8i16_post_reg_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v8i16_post_reg_st4lane: +;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 %inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) + + +define i16* @test_v4i16_post_imm_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { +;CHECK-LABEL: test_v4i16_post_imm_st4lane: +;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8 + call void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i32 4 + ret i16* %tmp +} + +define i16* @test_v4i16_post_reg_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i16_post_reg_st4lane: +;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) + %tmp = getelementptr i16* %A, i64 
%inc + ret i16* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) + + +define i32* @test_v4i32_post_imm_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { +;CHECK-LABEL: test_v4i32_post_imm_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16 + call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 4 + ret i32* %tmp +} + +define i32* @test_v4i32_post_reg_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4i32_post_reg_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) + + +define i32* @test_v2i32_post_imm_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { +;CHECK-LABEL: test_v2i32_post_imm_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16 + call void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i32 4 + ret i32* %tmp +} + +define i32* @test_v2i32_post_reg_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i32_post_reg_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) + %tmp = getelementptr i32* %A, i64 %inc + ret i32* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, 
i32*) + + +define i64* @test_v2i64_post_imm_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { +;CHECK-LABEL: test_v2i64_post_imm_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32 + call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 4 + ret i64* %tmp +} + +define i64* @test_v2i64_post_reg_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2i64_post_reg_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) + + +define i64* @test_v1i64_post_imm_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { +;CHECK-LABEL: test_v1i64_post_imm_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32 + call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 4 + ret i64* %tmp +} + +define i64* @test_v1i64_post_reg_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1i64_post_reg_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) + %tmp = getelementptr i64* %A, i64 %inc + ret i64* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) + + +define float* @test_v4f32_post_imm_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 
x float> %E) nounwind { +;CHECK-LABEL: test_v4f32_post_imm_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16 + call void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 4 + ret float* %tmp +} + +define float* @test_v4f32_post_reg_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v4f32_post_reg_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*) + + +define float* @test_v2f32_post_imm_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { +;CHECK-LABEL: test_v2f32_post_imm_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16 + call void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) + %tmp = getelementptr float* %A, i32 4 + ret float* %tmp +} + +define float* @test_v2f32_post_reg_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f32_post_reg_st4lane: +;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) + %tmp = getelementptr float* %A, i64 %inc + ret float* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) + + +define double* @test_v2f64_post_imm_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x 
double> %D, <2 x double> %E) nounwind { +;CHECK-LABEL: test_v2f64_post_imm_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32 + call void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 4 + ret double* %tmp +} + +define double* @test_v2f64_post_reg_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v2f64_post_reg_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*) + + +define double* @test_v1f64_post_imm_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { +;CHECK-LABEL: test_v1f64_post_imm_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32 + call void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 4 + ret double* %tmp +} + +define double* @test_v1f64_post_reg_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind { +;CHECK-LABEL: test_v1f64_post_reg_st4lane: +;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} + call void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) + %tmp = getelementptr double* %A, i64 %inc + ret double* %tmp +} + +declare void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*) \ No newline at end of file -- 2.34.1