From 5ad5f5931e34d5be410b1e901640bc1c2d308612 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Tue, 19 Mar 2013 08:15:38 +0000 Subject: [PATCH] Improve long vector sext/zext lowering on ARM The ARM backend currently has poor codegen for long sext/zext operations, such as v8i8 -> v8i32. This patch addresses this by performing a custom expansion in ARMISelLowering. It also adds/changes the cost of such lowering in ARMTTI. This partially addresses PR14867. Patch by Pete Couperus git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@177380 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMISelLowering.cpp | 55 ++++++++++ lib/Target/ARM/ARMTargetTransformInfo.cpp | 16 ++- test/Analysis/CostModel/ARM/cast.ll | 24 ++++- test/CodeGen/ARM/vcvt.ll | 123 +++++++++++++--------- 4 files changed, 157 insertions(+), 61 deletions(-) diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 514971f01ee..40d2e8d2657 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -564,6 +564,16 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); + // Custom expand long extensions to vectors. + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + // NEON does not have single instruction CTPOP for vectors with element // types wider than 8-bits. However, custom lowering can leverage the // v8i8/v16i8 vcnt instruction. @@ -3433,6 +3443,47 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } +/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec), +/// and size(DestVec) > 128-bits. +/// This is achieved by doing the one extension from the SrcVec, splitting the +/// result, extending these parts, and then concatenating these into the +/// destination. +static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) { + SDValue Op = N->getOperand(0); + EVT SrcVT = Op.getValueType(); + EVT DestVT = N->getValueType(0); + + assert(DestVT.getSizeInBits() > 128 && + "Custom sext/zext expansion needs >128-bit vector."); + // If this is a normal length extension, use the default expansion. 
+ if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() && + SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits()) + return SDValue(); + + DebugLoc dl = N->getDebugLoc(); + unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); + unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits(); + unsigned NumElts = SrcVT.getVectorNumElements(); + LLVMContext &Ctx = *DAG.getContext(); + SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi; + + EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2), + NumElts); + EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2), + NumElts/2); + EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize), + NumElts/2); + + Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op); + SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid, + DAG.getIntPtrConstant(0)); + SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid, + DAG.getIntPtrConstant(NumElts/2)); + ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo); + ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi); +} + /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -5621,6 +5672,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::BITCAST: Res = ExpandBITCAST(N, DAG); break; + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + Res = ExpandVectorExtension(N, DAG); + break; case ISD::SRL: case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 140a8db1697..1019b972e95 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -211,11 +211,19 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + // The number of vmovl instructions for the extension. + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + // Operations that we legalize using load/stores to the stack. - { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 16*2 + 4*4 }, - { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 16*2 + 4*3 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 8*2 + 2*4 }, - { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 8*2 + 2*3 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 }, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 }, diff --git a/test/Analysis/CostModel/ARM/cast.ll b/test/Analysis/CostModel/ARM/cast.ll index 96eb33564f4..ba9d84cf3e2 100644 --- a/test/Analysis/CostModel/ARM/cast.ll +++ b/test/Analysis/CostModel/ARM/cast.ll @@ -152,15 +152,29 @@ define i32 @casts() { ; CHECK: cost of 10 {{.*}} uitofp %r69 = uitofp i64 undef to double - ; Vector cast cost of instructions lowering the cast to the stack. 
- ; CHECK: cost of 24 {{.*}} sext + ; CHECK: cost of 3 {{.*}} sext %r70 = sext <8 x i8> undef to <8 x i32> - ; CHECK: cost of 48 {{.*}} sext + ; CHECK: cost of 6 {{.*}} sext %r71 = sext <16 x i8> undef to <16 x i32> - ; CHECK: cost of 22 {{.*}} zext + ; CHECK: cost of 3 {{.*}} zext %r72 = zext <8 x i8> undef to <8 x i32> - ; CHECK: cost of 44 {{.*}} zext + ; CHECK: cost of 6 {{.*}} zext %r73 = zext <16 x i8> undef to <16 x i32> + + ; CHECK: cost of 7 {{.*}} sext + %rext_0 = sext <8 x i8> undef to <8 x i64> + ; CHECK: cost of 7 {{.*}} zext + %rext_1 = zext <8 x i8> undef to <8 x i64> + ; CHECK: cost of 6 {{.*}} sext + %rext_2 = sext <8 x i16> undef to <8 x i64> + ; CHECK: cost of 6 {{.*}} zext + %rext_3 = zext <8 x i16> undef to <8 x i64> + ; CHECK: cost of 3 {{.*}} sext + %rext_4 = sext <4 x i16> undef to <4 x i64> + ; CHECK: cost of 3 {{.*}} zext + %rext_5 = zext <4 x i16> undef to <4 x i64> + + ; Vector cast cost of instructions lowering the cast to the stack. ; CHECK: cost of 19 {{.*}} trunc %r74 = trunc <8 x i32> undef to <8 x i8> ; CHECK: cost of 38 {{.*}} trunc diff --git a/test/CodeGen/ARM/vcvt.ll b/test/CodeGen/ARM/vcvt.ll index 72d36456fd2..e67b4788a37 100644 --- a/test/CodeGen/ARM/vcvt.ll +++ b/test/CodeGen/ARM/vcvt.ll @@ -165,17 +165,12 @@ declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone %T1_5 = type <8 x i32> ; CHECK: func_cvt5: define void @func_cvt5(%T0_5* %loadaddr, %T1_5* %storeaddr) { -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh +; CHECK: vmovl.s8 +; CHECK: vmovl.s16 +; CHECK: vmovl.s16 %v0 = load %T0_5* %loadaddr ; COST: func_cvt5 -; COST: cost of 24 {{.*}} sext +; COST: cost of 3 {{.*}} sext %r = sext %T0_5 %v0 to %T1_5 store %T1_5 %r, %T1_5* %storeaddr ret void @@ -186,17 +181,12 @@ define void @func_cvt5(%T0_5* %loadaddr, %T1_5* %storeaddr) { %TA1_5 = type <8 x i32> ; CHECK: func_cvt1: define void @func_cvt1(%TA0_5* %loadaddr, %TA1_5* %storeaddr) { -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh +; CHECK: vmovl.u8 +; CHECK: vmovl.u16 +; CHECK: vmovl.u16 %v0 = load %TA0_5* %loadaddr ; COST: func_cvt1 -; COST: cost of 22 {{.*}} zext +; COST: cost of 3 {{.*}} zext %r = zext %TA0_5 %v0 to %TA1_5 store %TA1_5 %r, %TA1_5* %storeaddr ret void @@ -228,25 +218,13 @@ define void @func_cvt51(%T0_51* %loadaddr, %T1_51* %storeaddr) { %TT1_5 = type <16 x i32> ; CHECK: func_cvt52: define void @func_cvt52(%TT0_5* %loadaddr, %TT1_5* %storeaddr) { -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh +; CHECK: vmovl.s16 +; CHECK: vmovl.s16 +; CHECK: vmovl.s16 +; CHECK: vmovl.s16 %v0 = load %TT0_5* %loadaddr ; COST: func_cvt52 -; COST: cost of 48 {{.*}} sext +; COST: cost of 6 {{.*}} sext %r = sext %TT0_5 %v0 to %TT1_5 store %TT1_5 %r, %TT1_5* %storeaddr ret void @@ -257,25 +235,13 @@ define void @func_cvt52(%TT0_5* %loadaddr, %TT1_5* %storeaddr) { %TTA1_5 = type <16 x i32> ; CHECK: func_cvt12: define void @func_cvt12(%TTA0_5* %loadaddr, %TTA1_5* %storeaddr) { -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh -; CHECK: strh +; CHECK: vmovl.u16 +; CHECK: vmovl.u16 
+; CHECK: vmovl.u16 +; CHECK: vmovl.u16 %v0 = load %TTA0_5* %loadaddr ; COST: func_cvt12 -; COST: cost of 44 {{.*}} zext +; COST: cost of 6 {{.*}} zext %r = zext %TTA0_5 %v0 to %TTA1_5 store %TTA1_5 %r, %TTA1_5* %storeaddr ret void @@ -309,3 +275,56 @@ define void @func_cvt512(%TT0_51* %loadaddr, %TT1_51* %storeaddr) { store %TT1_51 %r, %TT1_51* %storeaddr ret void } + +; CHECK: sext_v4i16_v4i64: +define void @sext_v4i16_v4i64(<4 x i16>* %loadaddr, <4 x i64>* %storeaddr) { +; CHECK: vmovl.s32 +; CHECK: vmovl.s32 + %v0 = load <4 x i16>* %loadaddr +; COST: sext_v4i16_v4i64 +; COST: cost of 3 {{.*}} sext + %r = sext <4 x i16> %v0 to <4 x i64> + store <4 x i64> %r, <4 x i64>* %storeaddr + ret void +} + +; CHECK: zext_v4i16_v4i64: +define void @zext_v4i16_v4i64(<4 x i16>* %loadaddr, <4 x i64>* %storeaddr) { +; CHECK: vmovl.u32 +; CHECK: vmovl.u32 + %v0 = load <4 x i16>* %loadaddr +; COST: zext_v4i16_v4i64 +; COST: cost of 3 {{.*}} zext + %r = zext <4 x i16> %v0 to <4 x i64> + store <4 x i64> %r, <4 x i64>* %storeaddr + ret void +} + +; CHECK: sext_v8i16_v8i64: +define void @sext_v8i16_v8i64(<8 x i16>* %loadaddr, <8 x i64>* %storeaddr) { +; CHECK: vmovl.s32 +; CHECK: vmovl.s32 +; CHECK: vmovl.s32 +; CHECK: vmovl.s32 + %v0 = load <8 x i16>* %loadaddr +; COST: sext_v8i16_v8i64 +; COST: cost of 6 {{.*}} sext + %r = sext <8 x i16> %v0 to <8 x i64> + store <8 x i64> %r, <8 x i64>* %storeaddr + ret void +} + +; CHECK: zext_v8i16_v8i64: +define void @zext_v8i16_v8i64(<8 x i16>* %loadaddr, <8 x i64>* %storeaddr) { +; CHECK: vmovl.u32 +; CHECK: vmovl.u32 +; CHECK: vmovl.u32 +; CHECK: vmovl.u32 + %v0 = load <8 x i16>* %loadaddr +; COST: zext_v8i16_v8i64 +; COST: cost of 6 {{.*}} zext + %r = zext <8 x i16> %v0 to <8 x i64> + store <8 x i64> %r, <8 x i64>* %storeaddr + ret void +} + -- 2.34.1
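
For reference, the expansion strategy maps directly onto NEON widening
moves. Below is a minimal intrinsics-level sketch of the v8i8 -> v8i32
case that func_cvt5 exercises; the helper name and the load/store
scaffolding are illustrative only and not part of the patch:

  #include <arm_neon.h>

  // Sign-extend <8 x i8> to <8 x i32> the way the custom expansion
  // does it: one vmovl.s8 into an <8 x i16> middle value, then one
  // vmovl.s16 per half -- three vmovl instructions in total.
  void sext_v8i8_to_v8i32(const int8_t src[8], int32_t dst[8]) {
    int8x8_t  v   = vld1_s8(src);                    // load <8 x i8>
    int16x8_t mid = vmovl_s8(v);                     // vmovl.s8 : <8 x i8>  -> <8 x i16>
    int32x4_t lo  = vmovl_s16(vget_low_s16(mid));    // vmovl.s16: low half  -> <4 x i32>
    int32x4_t hi  = vmovl_s16(vget_high_s16(mid));   // vmovl.s16: high half -> <4 x i32>
    vst1q_s32(dst,     lo);                          // store low  <4 x i32>
    vst1q_s32(dst + 4, hi);                          // store high <4 x i32>
  }

This is the three-vmovl sequence the updated func_cvt5 CHECK lines
expect, replacing the old element-by-element strh stores; the
zero-extend cases are identical with vmovl.u8/vmovl.u16.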
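
The new ARMTTI costs are simply the vmovl count for each expansion:
the element width doubles at every step, and each step costs one vmovl
per 128 bits of its result, since any intermediate wider than one
quad register is split in half. Worked out for the table entries above
(signed shown; the unsigned rows are the same with vmovl.u*):

  v8i8  -> v8i32  : 1 vmovl.s8  + 2 vmovl.s16               = 3
  v4i16 -> v4i64  : 1 vmovl.s16 + 2 vmovl.s32               = 3
  v16i8 -> v16i32 : 2 vmovl.s8  + 4 vmovl.s16               = 6
  v8i16 -> v8i64  : 2 vmovl.s16 + 4 vmovl.s32               = 6
  v8i8  -> v8i64  : 1 vmovl.s8  + 2 vmovl.s16 + 4 vmovl.s32 = 7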