From 6c822eea47dbef96940819b1ea085fabc49a1e71 Mon Sep 17 00:00:00 2001 From: James Molloy Date: Thu, 6 Sep 2012 09:16:01 +0000 Subject: [PATCH] Optimize codegen for VSETLNi{8,16,32} operating on Q registers. Degenerate to a VSETLN on D registers, instead of an (INSERT_SUBREG (VSETLN (EXTRACT_SUBREG ))) sequence to help the register coalescer. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@163298 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMExpandPseudoInsts.cpp | 51 +++++++++++++++++++++++ lib/Target/ARM/ARMInstrNEON.td | 32 +++++++------- test/CodeGen/ARM/integer_insertelement.ll | 35 ++++++++++++++++ test/CodeGen/ARM/vget_lane.ll | 2 +- 4 files changed, 102 insertions(+), 18 deletions(-) create mode 100644 test/CodeGen/ARM/integer_insertelement.ll diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 15bb32eb149..8ed6b751f3e 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1208,6 +1208,57 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, ExpandLaneOp(MBBI); return true; + case ARM::VSETLNi8Q: + case ARM::VSETLNi16Q: { + // Expand VSETLNs acting on a Q register to equivalent VSETLNs acting + // on the respective D register. + + unsigned QReg = MI.getOperand(1).getReg(); + unsigned QLane = MI.getOperand(3).getImm(); + + unsigned NewOpcode, DLane, DSubReg; + switch (Opcode) { + default: llvm_unreachable("Invalid opcode!"); + case ARM::VSETLNi8Q: + // 4 possible 8-bit lanes per DPR: + NewOpcode = ARM::VSETLNi8; + DLane = QLane % 8; + DSubReg = (QLane / 8) ? ARM::dsub_1 : ARM::dsub_0; + break; + case ARM::VSETLNi16Q: + // 4 possible 16-bit lanes per DPR. + NewOpcode = ARM::VSETLNi16; + DLane = QLane % 4; + DSubReg = (QLane / 4) ? ARM::dsub_1 : ARM::dsub_0; + break; + } + + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpcode)); + + unsigned DReg = TRI->getSubReg(QReg, DSubReg); + + MIB.addReg(DReg, RegState::Define); // Output DPR + MIB.addReg(DReg); // Input DPR + MIB.addOperand(MI.getOperand(2)); // Input GPR + MIB.addImm(DLane); // Lane + + // Add the predicate operands. + MIB.addOperand(MI.getOperand(4)); + MIB.addOperand(MI.getOperand(5)); + + if (MI.getOperand(1).isKill()) // Add an implicit kill for the Q register. + MIB->addRegisterKilled(QReg, TRI, true); + // And an implicit def of the output register (which should always be the + // same as the input register). + MIB->addRegisterDefined(QReg, TRI); + + TransferImpOps(MI, MIB, MIB); + + MI.eraseFromParent(); + return true; + } + case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true; case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true; case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 048d340df00..c5f3a83ab24 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -5045,25 +5045,23 @@ def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V), GPR:$R, imm:$lane))]> { let Inst{21} = lane{0}; } + +def VSETLNi8Q : PseudoNeonI<(outs QPR:$V), + (ins QPR:$src1, GPR:$R, VectorIndex8:$lane), + IIC_VMOVISL, "", + [(set QPR:$V, (vector_insert (v16i8 QPR:$src1), + GPR:$R, imm:$lane))]>; +def VSETLNi16Q : PseudoNeonI<(outs QPR:$V), + (ins QPR:$src1, GPR:$R, VectorIndex16:$lane), + IIC_VMOVISL, "", + [(set QPR:$V, (vector_insert (v8i16 QPR:$src1), + GPR:$R, imm:$lane))]>; } -def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane), - (v16i8 (INSERT_SUBREG QPR:$src1, - (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1, - (DSubReg_i8_reg imm:$lane))), - GPR:$src2, (SubReg_i8_lane imm:$lane))), - (DSubReg_i8_reg imm:$lane)))>; -def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane), - (v8i16 (INSERT_SUBREG QPR:$src1, - (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1, - (DSubReg_i16_reg imm:$lane))), - GPR:$src2, (SubReg_i16_lane imm:$lane))), - (DSubReg_i16_reg imm:$lane)))>; + def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane), - (v4i32 (INSERT_SUBREG QPR:$src1, - (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1, - (DSubReg_i32_reg imm:$lane))), - GPR:$src2, (SubReg_i32_lane imm:$lane))), - (DSubReg_i32_reg imm:$lane)))>; + (v4i32 (INSERT_SUBREG QPR:$src1, + GPR:$src2, + (SSubReg_f32_reg imm:$lane)))>; def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)), (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)), diff --git a/test/CodeGen/ARM/integer_insertelement.ll b/test/CodeGen/ARM/integer_insertelement.ll new file mode 100644 index 00000000000..4f2d7e3f73e --- /dev/null +++ b/test/CodeGen/ARM/integer_insertelement.ll @@ -0,0 +1,35 @@ +; RUN: llc %s -o - -march=arm -mattr=+neon | FileCheck %s + +; This test checks that when inserting one (integer) element into a vector, +; the vector is not spuriously copied. "vorr dX, dY, dY" is the way of moving +; one DPR to another that we check for. + +; CHECK: @f +; CHECK-NOT: vorr d +; CHECK: vmov s +; CHECK-NOT: vorr d +; CHECK: mov pc, lr +define <4 x i32> @f(<4 x i32> %in) { + %1 = insertelement <4 x i32> %in, i32 255, i32 3 + ret <4 x i32> %1 +} + +; CHECK: @g +; CHECK-NOT: vorr d +; CHECK: vmov.16 d +; CHECK-NOT: vorr d +; CHECK: mov pc, lr +define <8 x i16> @g(<8 x i16> %in) { + %1 = insertelement <8 x i16> %in, i16 255, i32 7 + ret <8 x i16> %1 +} + +; CHECK: @h +; CHECK-NOT: vorr d +; CHECK: vmov.8 d +; CHECK-NOT: vorr d +; CHECK: mov pc, lr +define <16 x i8> @h(<16 x i8> %in) { + %1 = insertelement <16 x i8> %in, i8 255, i32 15 + ret <16 x i8> %1 +} diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll index 1fc885d6137..2ed65c9aeed 100644 --- a/test/CodeGen/ARM/vget_lane.ll +++ b/test/CodeGen/ARM/vget_lane.ll @@ -200,7 +200,7 @@ define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind { define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind { ;CHECK: vsetQ_lane32: -;CHECK: vmov.32 +;CHECK: vmov s %tmp1 = load <4 x i32>* %A %tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1 ret <4 x i32> %tmp2 -- 2.34.1