From: Bob Wilson Date: Mon, 1 Nov 2010 22:04:05 +0000 (+0000) Subject: Add NEON VLD1-lane instructions. Partial fix for Radar 8599955. X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=b796bbb6de19872c0c1921b8b3f05206dd33c97d;p=oota-llvm.git Add NEON VLD1-lane instructions. Partial fix for Radar 8599955. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@117964 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 53d2e9df126..774324b4528 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -110,6 +110,13 @@ namespace { } static const NEONLdStTableEntry NEONLdStTable[] = { +{ ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, EvenDblSpc, 1, 4 }, +{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, false, EvenDblSpc, 1, 4 }, +{ ARM::VLD1LNq32Pseudo, ARM::VLD1LNd32, true, false, EvenDblSpc, 1, 2 }, +{ ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, false, EvenDblSpc, 1, 2 }, +{ ARM::VLD1LNq8Pseudo, ARM::VLD1LNd8, true, false, EvenDblSpc, 1, 8 }, +{ ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, false, EvenDblSpc, 1, 8 }, + { ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, SingleSpc, 4, 1 }, { ARM::VLD1d64QPseudo_UPD, ARM::VLD1d64Q_UPD, true, true, SingleSpc, 4, 1 }, { ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, SingleSpc, 3, 1 }, @@ -476,8 +483,9 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { DstIsDead = MI.getOperand(OpIdx).isDead(); DstReg = MI.getOperand(OpIdx++).getReg(); GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); - MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 1) + MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 2) MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 3) @@ -502,7 +510,9 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { // Add the subregs as sources of the new instruction. unsigned SrcFlags = (getUndefRegState(MO.isUndef()) | getKillRegState(MO.isKill())); - MIB.addReg(D0, SrcFlags).addReg(D1, SrcFlags); + MIB.addReg(D0, SrcFlags); + if (NumRegs > 1) + MIB.addReg(D1, SrcFlags); if (NumRegs > 2) MIB.addReg(D2, SrcFlags); if (NumRegs > 3) @@ -943,6 +953,12 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { ExpandVST(MBBI); break; + case ARM::VLD1LNq8Pseudo: + case ARM::VLD1LNq16Pseudo: + case ARM::VLD1LNq32Pseudo: + case ARM::VLD1LNq8Pseudo_UPD: + case ARM::VLD1LNq16Pseudo_UPD: + case ARM::VLD1LNq32Pseudo_UPD: case ARM::VLD2LNd8Pseudo: case ARM::VLD2LNd16Pseudo: case ARM::VLD2LNd32Pseudo: diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index eadc907529d..ed14814dde8 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -421,6 +421,8 @@ def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo; def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo; def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo; +} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 + // Classes for VLD*LN pseudo-instructions with multi-register operands. // These are expanded to real instructions after register allocation. class VLDQLNPseudo @@ -449,7 +451,46 @@ class VLDQQQQLNWBPseudo nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; // VLD1LN : Vector Load (single element to one lane) -// FIXME: Not yet implemented. +class VLD1LN op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag LoadOp> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst), + (ins addrmode6:$addr, DPR:$src, nohash_imm:$lane), + IIC_VLD1ln, "vld1", Dt, "\\{$dst[$lane]\\}, $addr", + "$src = $dst", + [(set DPR:$dst, (vector_insert (Ty DPR:$src), + (i32 (LoadOp addrmode6:$addr)), + imm:$lane))]>; +class VLD1QLNPseudo : VLDQLNPseudo { + let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src), + (i32 (LoadOp addrmode6:$addr)), + imm:$lane))]; +} + +def VLD1LNd8 : VLD1LN<0b0000, {?,?,?,0}, "8", v8i8, extloadi8>; +def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16>; +def VLD1LNd32 : VLD1LN<0b1000, {?,0,?,?}, "32", v2i32, load>; + +def VLD1LNq8Pseudo : VLD1QLNPseudo; +def VLD1LNq16Pseudo : VLD1QLNPseudo; +def VLD1LNq32Pseudo : VLD1QLNPseudo; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { + +// ...with address register writeback: +class VLD1LNWB op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt, + "\\{$dst[$lane]\\}, $addr$offset", + "$src = $dst, $addr.addr = $wb", []>; + +def VLD1LNd8_UPD : VLD1LNWB<0b0000, {?,?,?,0}, "8">; +def VLD1LNd16_UPD : VLD1LNWB<0b0100, {?,?,0,?}, "16">; +def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32">; + +def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo; +def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo; +def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo; // VLD2LN : Vector Load (single 2-element structure to one lane) class VLD2LN op11_8, bits<4> op7_4, string Dt> diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 61489d64f37..7f4d2bbd44a 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -134,6 +134,8 @@ def IIC_VLD1u : InstrItinClass; def IIC_VLD1x2u : InstrItinClass; def IIC_VLD1x3u : InstrItinClass; def IIC_VLD1x4u : InstrItinClass; +def IIC_VLD1ln : InstrItinClass; +def IIC_VLD1lnu : InstrItinClass; def IIC_VLD2 : InstrItinClass; def IIC_VLD2x2 : InstrItinClass; def IIC_VLD2u : InstrItinClass; diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index 1f19b21e1c3..b5c8def42f9 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -457,6 +457,18 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<3, [A8_LSPipe]>], [2, 2, 3, 3, 2, 1]>, // + // VLD1ln + InstrItinData, + InstrStage<3, [A8_NLSPipe], 1>, + InstrStage<3, [A8_LSPipe]>], + [3, 1, 1, 1]>, + // + // VLD1lnu + InstrItinData, + InstrStage<3, [A8_NLSPipe], 1>, + InstrStage<3, [A8_LSPipe]>], + [3, 2, 1, 1, 1, 1]>, + // // VLD2 InstrItinData, InstrStage<2, [A8_NLSPipe], 1>, diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 3096b0ad990..c78f59383f2 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -787,6 +787,24 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<3, [A9_LSUnit]>], [2, 2, 3, 3, 2, 1]>, // + // VLD1ln + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 1>, + InstrStage<3, [A9_LSUnit]>], + [4, 1, 1, 1]>, + // + // VLD1lnu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 1>, + InstrStage<3, [A9_LSUnit]>], + [4, 2, 1, 1, 1, 1]>, + // // VLD2 InstrItinData, InstrStage<1, [A9_MUX0], 0>, diff --git a/test/CodeGen/ARM/vldlane.ll b/test/CodeGen/ARM/vldlane.ll index 3a7bafe4d65..57e74dfe249 100644 --- a/test/CodeGen/ARM/vldlane.ll +++ b/test/CodeGen/ARM/vldlane.ll @@ -1,5 +1,32 @@ ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s +define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind { +;CHECK: vld1lanei8: +;CHECK: vld1.8 {d16[3]}, [r0] + %tmp1 = load <8 x i8>* %B + %tmp2 = load i8* %A, align 1 + %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind { +;CHECK: vld1lanei16: +;CHECK: vld1.16 {d16[2]}, [r0] + %tmp1 = load <4 x i16>* %B + %tmp2 = load i16* %A, align 2 + %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind { +;CHECK: vld1lanei32: +;CHECK: vld1.32 {d16[1]}, [r0] + %tmp1 = load <2 x i32>* %B + %tmp2 = load i32* %A, align 4 + %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1 + ret <2 x i32> %tmp3 +} + %struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> } %struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> } %struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }