From: Evan Cheng
Date: Mon, 17 May 2010 20:57:12 +0000 (+0000)
Subject: Careful with reg_sequence coalescing not to overwrite sub-register indices.
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=53c779bb3a5712b67cf6bae785cb83a90841d17e;p=oota-llvm.git

Careful with reg_sequence coalescing not to overwrite sub-register indices.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@103971 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 80bb1a9e9d0..fb9bddca25c 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -128,6 +128,8 @@ namespace {
     void ProcessCopy(MachineInstr *MI, MachineBasicBlock *MBB,
                      SmallPtrSet<MachineInstr*, 8> &Processed);
 
+    void CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs, unsigned DstReg);
+
     /// EliminateRegSequences - Eliminate REG_SEQUENCE instructions as part
     /// of the de-ssa process. This replaces sources of REG_SEQUENCE as
     /// sub-register references of the register defined by REG_SEQUENCE.
@@ -1132,7 +1134,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
 }
 
 static void UpdateRegSequenceSrcs(unsigned SrcReg,
-                                  unsigned DstReg, unsigned SrcIdx,
+                                  unsigned DstReg, unsigned SubIdx,
                                   MachineRegisterInfo *MRI) {
   for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg),
          RE = MRI->reg_end(); RI != RE; ) {
@@ -1140,7 +1142,77 @@ static void UpdateRegSequenceSrcs(unsigned SrcReg,
     ++RI;
     MO.setReg(DstReg);
     assert(MO.getSubReg() == 0);
-    MO.setSubReg(SrcIdx);
+    MO.setSubReg(SubIdx);
+  }
+}
+
+/// CoalesceExtSubRegs - If a number of sources of the REG_SEQUENCE are
+/// EXTRACT_SUBREGs from the same source register into the same virtual
+/// register with different sub-register indices, attempt to combine the
+/// EXTRACT_SUBREGs and pre-coalesce them. e.g.
+/// %reg1026 = VLDMQ %reg1025, 260, pred:14, pred:%reg0
+/// %reg1029:6 = EXTRACT_SUBREG %reg1026, 6
+/// %reg1029:5 = EXTRACT_SUBREG %reg1026, 5
+/// Since D subregs 5, 6 can combine to a Q register, we can coalesce
+/// reg1026 to reg1029.
+void
+TwoAddressInstructionPass::CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs,
+                                              unsigned DstReg) {
+  SmallSet<unsigned, 4> Seen;
+  for (unsigned i = 0, e = Srcs.size(); i != e; ++i) {
+    unsigned SrcReg = Srcs[i];
+    if (!Seen.insert(SrcReg))
+      continue;
+
+    // If there are no uses other than extract_subregs which feed into
+    // the reg_sequence, then we might be able to coalesce them.
+    bool CanCoalesce = true;
+    SmallVector<unsigned, 4> SubIndices;
+    for (MachineRegisterInfo::use_nodbg_iterator
+           UI = MRI->use_nodbg_begin(SrcReg),
+           UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
+      MachineInstr *UseMI = &*UI;
+      if (!UseMI->isExtractSubreg() ||
+          UseMI->getOperand(0).getReg() != DstReg) {
+        CanCoalesce = false;
+        break;
+      }
+      SubIndices.push_back(UseMI->getOperand(2).getImm());
+    }
+
+    if (!CanCoalesce || SubIndices.size() < 2)
+      continue;
+
+    std::sort(SubIndices.begin(), SubIndices.end());
+    unsigned NewSubIdx = 0;
+    if (TRI->canCombinedSubRegIndex(MRI->getRegClass(SrcReg), SubIndices,
+                                    NewSubIdx)) {
+      bool Proceed = true;
+      if (NewSubIdx)
+        for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg),
+               RE = MRI->reg_end(); RI != RE; ) {
+          MachineOperand &MO = RI.getOperand();
+          ++RI;
+          // FIXME: If the sub-registers do not combine to the whole
+          // super-register, i.e. NewSubIdx != 0, and any of the uses has a
+          // sub-register index, then abort the coalescing attempt.
+          if (MO.getSubReg()) {
+            Proceed = false;
+            break;
+          }
+          MO.setReg(DstReg);
+          MO.setSubReg(NewSubIdx);
+        }
+      if (Proceed)
+        for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg),
+               RE = MRI->reg_end(); RI != RE; ) {
+          MachineOperand &MO = RI.getOperand();
+          ++RI;
+          MO.setReg(DstReg);
+          if (NewSubIdx)
+            MO.setSubReg(NewSubIdx);
+        }
+    }
+  }
   }
 }
@@ -1221,50 +1293,15 @@ bool TwoAddressInstructionPass::EliminateRegSequences() {
     for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) {
       unsigned SrcReg = MI->getOperand(i).getReg();
-      unsigned SrcIdx = MI->getOperand(i+1).getImm();
-      UpdateRegSequenceSrcs(SrcReg, DstReg, SrcIdx, MRI);
+      unsigned SubIdx = MI->getOperand(i+1).getImm();
+      UpdateRegSequenceSrcs(SrcReg, DstReg, SubIdx, MRI);
     }
 
     DEBUG(dbgs() << "Eliminated: " << *MI);
     MI->eraseFromParent();
 
     // Try coalescing some EXTRACT_SUBREG instructions.
-    Seen.clear();
-    for (unsigned i = 0, e = RealSrcs.size(); i != e; ++i) {
-      unsigned SrcReg = RealSrcs[i];
-      if (!Seen.insert(SrcReg))
-        continue;
-
-      // If there are no other uses than extract_subreg which feed into
-      // the reg_sequence, then we might be able to coalesce them.
-      bool CanCoalesce = true;
-      SmallVector<unsigned, 4> SubIndices;
-      for (MachineRegisterInfo::use_nodbg_iterator
-             UI = MRI->use_nodbg_begin(SrcReg),
-             UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
-        MachineInstr *UseMI = &*UI;
-        if (!UseMI->isExtractSubreg() ||
-            UseMI->getOperand(0).getReg() != DstReg) {
-          CanCoalesce = false;
-          break;
-        }
-        SubIndices.push_back(UseMI->getOperand(2).getImm());
-      }
-
-      if (!CanCoalesce)
-        continue;
-
-      // %reg1026 = VLDMQ %reg1025, 260, pred:14, pred:%reg0
-      // %reg1029:6 = EXTRACT_SUBREG %reg1026, 6
-      // %reg1029:5 = EXTRACT_SUBREG %reg1026, 5
-      // Since D subregs 5, 6 can combine to a Q register, we can coalesce
-      // reg1026 to reg1029.
-      std::sort(SubIndices.begin(), SubIndices.end());
-      unsigned NewSubIdx = 0;
-      if (TRI->canCombinedSubRegIndex(MRI->getRegClass(SrcReg), SubIndices,
-                                      NewSubIdx))
-        UpdateRegSequenceSrcs(SrcReg, DstReg, NewSubIdx, MRI);
-    }
+    CoalesceExtSubRegs(RealSrcs, DstReg);
   }
 
   RegSequences.clear();
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 5a5792b55ae..5bfabefa8eb 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -3,6 +3,7 @@
 %struct.int16x8_t = type { <8 x i16> }
 %struct.int32x4_t = type { <4 x i32> }
 
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
 %struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
 %struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
@@ -149,12 +150,51 @@ define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
   ret <8 x i16> %tmp5
 }
 
+define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
+; CHECK: t6:
+; CHECK: vldr.64
+; CHECK: vmov d1, d0
+; CHECK-NEXT: vld2.8 {d0[1], d1[1]}
+  %tmp1 = load <8 x i8>* %B                                ; <<8 x i8>> [#uses=2]
+  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
+  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0  ; <<8 x i8>> [#uses=1]
+  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1  ; <<8 x i8>> [#uses=1]
+  %tmp5 = add <8 x i8> %tmp3, %tmp4                        ; <<8 x i8>> [#uses=1]
+  ret <8 x i8> %tmp5
+}
+
+define arm_apcscc void @t7(i32* %iptr, i32* %optr) nounwind {
+entry:
+; CHECK: t7:
+; CHECK: vld2.32
+; CHECK: vst2.32
+; CHECK: vld1.32 {d0, d1},
+; CHECK: vmov q1, q0
+; CHECK-NOT: vmov
+; CHECK: vuzp.32 q0, q1
+; CHECK: vst1.32
+  %0 = bitcast i32* %iptr to i8*                           ; <i8*> [#uses=2]
+  %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+  %tmp57 = extractvalue %struct.__neon_int32x4x2_t %1, 0   ; <<4 x i32>> [#uses=1]
+  %tmp60 = extractvalue %struct.__neon_int32x4x2_t %1, 1   ; <<4 x i32>> [#uses=1]
+  %2 = bitcast i32* %optr to i8*                           ; <i8*> [#uses=2]
+  tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60)
+  %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0) ; <<4 x i32>> [#uses=1]
+  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> ; <<4 x i32>> [#uses=1]
+  tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4)
+  ret void
+}
+
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly
+
 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly
 
 declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
 
 declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
 
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind
+
 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind
 
 declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
@@ -163,6 +203,8 @@ declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
 
 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*) nounwind readonly
 
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind readonly
+
 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind readonly
 
 declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>) nounwind
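
The machine-level effect of the new CoalesceExtSubRegs hook can be pictured with the example from its doc comment. The before/after sketch below is illustrative only, not compiler output; the "after" half is one reading of the intent, assuming the two D sub-indices (5 and 6) combine to cover the whole Q register, i.e. NewSubIdx == 0, so every reference to the source register is simply rewritten to the REG_SEQUENCE destination.

    ; Before: %reg1026 is read only by EXTRACT_SUBREGs that define
    ; sub-registers of the REG_SEQUENCE destination %reg1029.
    %reg1026 = VLDMQ %reg1025, 260, pred:14, pred:%reg0
    %reg1029:6 = EXTRACT_SUBREG %reg1026, 6
    %reg1029:5 = EXTRACT_SUBREG %reg1026, 5

    ; After: %reg1026 is renamed to %reg1029, so the EXTRACT_SUBREGs become
    ; identity extracts that later coalescing can remove.
    %reg1029 = VLDMQ %reg1025, 260, pred:14, pred:%reg0
    %reg1029:6 = EXTRACT_SUBREG %reg1029, 6
    %reg1029:5 = EXTRACT_SUBREG %reg1029, 5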