From eb0c3d372906df9c61a31651a0ba278034447e94 Mon Sep 17 00:00:00 2001 From: Bob Wilson Date: Fri, 3 Sep 2010 01:35:08 +0000 Subject: [PATCH] Replace NEON vabdl, vaba, and vabal intrinsics with combinations of the vabd intrinsic and add and/or zext operations. In the case of vaba, this also avoids the need for a DAG combine pattern to combine vabd with add. Update tests. Auto-upgrade the old intrinsics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112941 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IntrinsicsARM.td | 8 -- lib/Target/ARM/ARMISelLowering.cpp | 17 ---- lib/Target/ARM/ARMInstrNEON.td | 127 +++++++++++++++++++++--- lib/VMCore/AutoUpgrade.cpp | 62 +++++++++--- test/Bitcode/neon-intrinsics.ll | 38 ++++++++ test/Bitcode/neon-intrinsics.ll.bc | Bin 5016 -> 5764 bytes test/CodeGen/ARM/vaba.ll | 152 ++++++++++++++--------------- test/CodeGen/ARM/vabd.ll | 38 ++++---- 8 files changed, 289 insertions(+), 153 deletions(-) diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td index f14d5378bdd..6c047718e6f 100644 --- a/include/llvm/IntrinsicsARM.td +++ b/include/llvm/IntrinsicsARM.td @@ -176,14 +176,6 @@ let TargetPrefix = "arm" in { // Vector Absolute Differences. def int_arm_neon_vabds : Neon_2Arg_Intrinsic; def int_arm_neon_vabdu : Neon_2Arg_Intrinsic; -def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic; -def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic; - -// Vector Absolute Difference and Accumulate. -def int_arm_neon_vabas : Neon_3Arg_Intrinsic; -def int_arm_neon_vabau : Neon_3Arg_Intrinsic; -def int_arm_neon_vabals : Neon_3Arg_Long_Intrinsic; -def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic; // Vector Pairwise Add. def int_arm_neon_vpadd : Neon_2Arg_Intrinsic; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0a41d6d9574..ce4a2c90689 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -4293,28 +4293,11 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, /// operands. static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) { SDValue Result = combineSelectAndUse(N, N0, N1, DCI); if (Result.getNode()) return Result; } - - // fold (add (arm_neon_vabd a, b) c) -> (arm_neon_vaba c, a, b) - EVT VT = N->getValueType(0); - if (N0.getOpcode() == ISD::INTRINSIC_WO_CHAIN && VT.isInteger()) { - unsigned IntNo = cast(N0.getOperand(0))->getZExtValue(); - if (IntNo == Intrinsic::arm_neon_vabds) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), VT, - DAG.getConstant(Intrinsic::arm_neon_vabas, MVT::i32), - N1, N0.getOperand(1), N0.getOperand(2)); - if (IntNo == Intrinsic::arm_neon_vabdu) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), VT, - DAG.getConstant(Intrinsic::arm_neon_vabau, MVT::i32), - N1, N0.getOperand(1), N0.getOperand(2)); - } - return SDValue(); } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index fb51a2f873f..8c8d1d7b8a7 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -1288,6 +1288,24 @@ class N3VQMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, (ResTy (NEONvduplane (OpTy DPR_8:$src3), imm:$lane)))))))]>; +// Neon Intrinsic-Op instructions (VABA): double- and quad-register. +class N3VDIntOp op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, Intrinsic IntOp, SDNode OpNode> + : N3V; +class N3VQIntOp op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, Intrinsic IntOp, SDNode OpNode> + : N3V; + // Neon 3-argument intrinsics, both double- and quad-register. // The destination register is also used as the first source operand register. class N3VDInt3 op21_20, bits<4> op11_8, bit op4, @@ -1342,6 +1360,17 @@ class N3VLMulOpSL16 op21_20, bits<4> op11_8, (TyD (NEONvduplane (TyD DPR_8:$src3), imm:$lane))))))]>; +// Long Intrinsic-Op vector operations with explicit extend (VABAL). +class N3VLIntExtOp op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, + SDNode OpNode> + : N3V; // Neon Long 3-argument intrinsic. The destination register is // a quad-register and is also used as the first source operand register. @@ -1433,6 +1462,19 @@ class N3VLExt op21_20, bits<4> op11_8, bit op4, let isCommutable = Commutable; } +// Long 3-register intrinsics with explicit extend (VABDL). +class N3VLIntExt op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, + bit Commutable> + : N3V { + let isCommutable = Commutable; +} + // Long 3-register intrinsics. class N3VLInt op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, @@ -1918,6 +1960,21 @@ multiclass N3VLInt_QHS op11_8, bit op4, v8i16, v8i8, IntOp, Commutable>; } +// ....with explicit extend (VABDL). +multiclass N3VLIntExt_QHS op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VLIntExt; + def v4i32 : N3VLIntExt; + def v2i64 : N3VLIntExt; +} + // Neon Wide 3-register vector intrinsics, // source operand element sizes of 8, 16 and 32 bits: @@ -1975,6 +2032,29 @@ multiclass N3VMulOpSL_HS op11_8, mul, ShOp>; } +// Neon Intrinsic-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VIntOp_QHS op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, Intrinsic IntOp, + SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N3VDIntOp; + def v4i16 : N3VDIntOp; + def v2i32 : N3VDIntOp; + + // 128-bit vector types. + def v16i8 : N3VQIntOp; + def v8i16 : N3VQIntOp; + def v4i32 : N3VQIntOp; +} + // Neon 3-argument intrinsics, // element sizes of 8, 16 and 32 bits: multiclass N3VInt3_QHS op11_8, bit op4, @@ -2050,6 +2130,21 @@ multiclass N3VLInt3_QHS op11_8, bit op4, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>; } +// ....with explicit extend (VABAL). +multiclass N3VLIntExtOp_QHS op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> { + def v8i16 : N3VLIntExtOp; + def v4i32 : N3VLIntExtOp; + def v2i64 : N3VLIntExtOp; +} + // Neon 2-register vector intrinsics, // element sizes of 8, 16 and 32 bits: @@ -2765,32 +2860,32 @@ def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, // VABD : Vector Absolute Difference defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vabd", "s", int_arm_neon_vabds, 0>; + "vabd", "s", int_arm_neon_vabds, 1>; defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vabd", "u", int_arm_neon_vabdu, 0>; + "vabd", "u", int_arm_neon_vabdu, 1>; def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND, - "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>; + "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>; def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ, - "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>; + "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>; // VABDL : Vector Absolute Difference Long (Q = | D - D |) -defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vabdl", "s", int_arm_neon_vabdls, 0>; -defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vabdl", "u", int_arm_neon_vabdlu, 0>; +defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, + "vabdl", "s", int_arm_neon_vabds, zext, 1>; +defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, + "vabdl", "u", int_arm_neon_vabdu, zext, 1>; // VABA : Vector Absolute Difference and Accumulate -defm VABAs : N3VInt3_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, - "vaba", "s", int_arm_neon_vabas>; -defm VABAu : N3VInt3_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ, - "vaba", "u", int_arm_neon_vabau>; +defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "s", int_arm_neon_vabds, add>; +defm VABAu : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "u", int_arm_neon_vabdu, add>; // VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |) -defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, IIC_VABAD, IIC_VABAD, - "vabal", "s", int_arm_neon_vabals>; -defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, IIC_VABAD, IIC_VABAD, - "vabal", "u", int_arm_neon_vabalu>; +defm VABALs : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD, + "vabal", "s", int_arm_neon_vabds, zext, add>; +defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD, + "vabal", "u", int_arm_neon_vabdu, zext, add>; // Vector Maximum and Minimum. diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp index e625d413f5d..fd64460d6fd 100644 --- a/lib/VMCore/AutoUpgrade.cpp +++ b/lib/VMCore/AutoUpgrade.cpp @@ -81,21 +81,21 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { } else if (Name.compare(5, 9, "arm.neon.", 9) == 0) { if (((Name.compare(14, 5, "vmovl", 5) == 0 || Name.compare(14, 5, "vaddl", 5) == 0 || - Name.compare(14, 5, "vsubl", 5) == 0) && - (Name.compare(19, 2, "s.", 2) == 0 || - Name.compare(19, 2, "u.", 2) == 0)) || - - ((Name.compare(14, 5, "vaddw", 5) == 0 || - Name.compare(14, 5, "vsubw", 5) == 0) && - (Name.compare(19, 2, "s.", 2) == 0 || - Name.compare(19, 2, "u.", 2) == 0)) || - - ((Name.compare(14, 5, "vmull", 5) == 0 || + Name.compare(14, 5, "vsubl", 5) == 0 || + Name.compare(14, 5, "vaddw", 5) == 0 || + Name.compare(14, 5, "vsubw", 5) == 0 || + Name.compare(14, 5, "vmull", 5) == 0 || Name.compare(14, 5, "vmlal", 5) == 0 || - Name.compare(14, 5, "vmlsl", 5) == 0) && + Name.compare(14, 5, "vmlsl", 5) == 0 || + Name.compare(14, 5, "vabdl", 5) == 0 || + Name.compare(14, 5, "vabal", 5) == 0) && (Name.compare(19, 2, "s.", 2) == 0 || Name.compare(19, 2, "u.", 2) == 0)) || + (Name.compare(14, 4, "vaba", 4) == 0 && + (Name.compare(18, 2, "s.", 2) == 0 || + Name.compare(18, 2, "u.", 2) == 0)) || + (Name.compare(14, 6, "vmovn.", 6) == 0)) { // Calls to these are transformed into IR without intrinsics. @@ -391,6 +391,35 @@ static void ExtendNEONArgs(CallInst *CI, Value *Arg0, Value *Arg1, } } +/// CallVABD - As part of expanding a call to one of the old NEON vabdl, vaba, +/// or vabal intrinsics, construct a call to a vabd intrinsic. Examine the +/// name of the old intrinsic to determine whether to use a signed or unsigned +/// vabd intrinsic. Get the type from the old call instruction, adjusted for +/// half-size vector elements if the old intrinsic was vabdl or vabal. +static Instruction *CallVABD(CallInst *CI, Value *Arg0, Value *Arg1) { + Function *F = CI->getCalledFunction(); + const std::string& Name = F->getName(); + bool isLong = (Name.at(18) == 'l'); + bool isSigned = (Name.at(isLong ? 19 : 18) == 's'); + + Intrinsic::ID intID; + if (isSigned) + intID = Intrinsic::arm_neon_vabds; + else + intID = Intrinsic::arm_neon_vabdu; + + const Type *Ty = CI->getType(); + if (isLong) + Ty = VectorType::getTruncatedElementVectorType(cast(Ty)); + + Function *VABD = Intrinsic::getDeclaration(F->getParent(), intID, &Ty, 1); + Value *Operands[2]; + Operands[0] = Arg0; + Operands[1] = Arg1; + return CallInst::Create(VABD, Operands, Operands+2, + "upgraded."+CI->getName(), CI); +} + // UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the // upgraded intrinsic. All argument and return casting must be provided in // order to seamlessly integrate with existing context. @@ -434,6 +463,15 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI); NewI = BinaryOperator::CreateSub(CI->getArgOperand(0), MulI, "upgraded."+CI->getName(), CI); + } else if (Name.compare(14, 4, "vabd", 4) == 0) { + NewI = CallVABD(CI, CI->getArgOperand(0), CI->getArgOperand(1)); + NewI = new ZExtInst(NewI, CI->getType(), "upgraded."+CI->getName(), CI); + } else if (Name.compare(14, 4, "vaba", 4) == 0) { + NewI = CallVABD(CI, CI->getArgOperand(1), CI->getArgOperand(2)); + if (Name.at(18) == 'l') + NewI = new ZExtInst(NewI, CI->getType(), "", CI); + NewI = BinaryOperator::CreateAdd(CI->getArgOperand(0), NewI, + "upgraded."+CI->getName(), CI); } else if (Name.compare(14, 6, "vmovn.", 6) == 0) { NewI = new TruncInst(CI->getArgOperand(0), CI->getType(), "upgraded." + CI->getName(), CI); @@ -675,7 +713,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { } switch (NewFn->getIntrinsicID()) { - default: llvm_unreachable("Unknown function for CallInst upgrade."); + default: llvm_unreachable("Unknown function for CallInst upgrade."); case Intrinsic::arm_neon_vld1: case Intrinsic::arm_neon_vld2: case Intrinsic::arm_neon_vld3: diff --git a/test/Bitcode/neon-intrinsics.ll b/test/Bitcode/neon-intrinsics.ll index 9dda745bc7e..272cd424e2a 100644 --- a/test/Bitcode/neon-intrinsics.ll +++ b/test/Bitcode/neon-intrinsics.ll @@ -126,6 +126,44 @@ ; CHECK-NEXT: mul <2 x i64> ; CHECK-NEXT: sub <2 x i64> +; vaba should be auto-upgraded to vabd + add + +; CHECK: vabas32 +; CHECK-NOT: arm.neon.vabas.v2i32 +; CHECK: arm.neon.vabds.v2i32 +; CHECK-NEXT: add <2 x i32> + +; CHECK: vabaQu8 +; CHECK-NOT: arm.neon.vabau.v16i8 +; CHECK: arm.neon.vabdu.v16i8 +; CHECK-NEXT: add <16 x i8> + +; vabal should be auto-upgraded to vabd with zext + add + +; CHECK: vabals16 +; CHECK-NOT: arm.neon.vabals.v4i32 +; CHECK: arm.neon.vabds.v4i16 +; CHECK-NEXT: zext <4 x i16> +; CHECK-NEXT: add <4 x i32> + +; CHECK: vabalu32 +; CHECK-NOT: arm.neon.vabalu.v2i64 +; CHECK: arm.neon.vabdu.v2i32 +; CHECK-NEXT: zext <2 x i32> +; CHECK-NEXT: add <2 x i64> + +; vabdl should be auto-upgraded to vabd with zext + +; CHECK: vabdls8 +; CHECK-NOT: arm.neon.vabdls.v8i16 +; CHECK: arm.neon.vabds.v8i8 +; CHECK-NEXT: zext <8 x i8> + +; CHECK: vabdlu16 +; CHECK-NOT: arm.neon.vabdlu.v4i32 +; CHECK: arm.neon.vabdu.v4i16 +; CHECK-NEXT: zext <4 x i16> + ; vmovn should be auto-upgraded to trunc ; CHECK: vmovni16 diff --git a/test/Bitcode/neon-intrinsics.ll.bc b/test/Bitcode/neon-intrinsics.ll.bc index e925cea4de8a90efb6e3e456dc13151e22abe1fb..cabc3c9341362ad47ede7cc08c5907de591c3aac 100644 GIT binary patch literal 5764 zcmb_f4Nw&48Gd(f4^CM0fIB(DHCfpHe{>0-vgA<}022#z40-`Z# zLKZlTiYDg7Oj=_Htm8P7FemFq+*)+8#@qVUp zrN}vGTOiuT#AfTP8iOk;JI^FU?MzPX_K0j^a+N9alqPQF!RCYf$sr1HTn5C#-_id6 z;CLYDs~sW_Dnq9BJN*4JbO97_096>1ouY~2%If?uO%T+RrU@rC@uopKIX9lkv~1_h zT^dayo2uQ&zucu^65XfGv?l_e?2u>P&K;x^&&7(AnoZ@)_{cmPTWWu4KmYUV{Kh5+ zUF|4pa{kQgEYUfuh%&2{ zOScC*ZHY9?Mz!#@qI)Ut?S^LHB6)f-1sUtR;S&jg#v%QKLzr1 z$-Wj19p}_&((8*mY>EHo?mKfWq1GMo*jDx-1Z_%&eH~gFM;UD44ODu)nCFcBOVqaJ zLO1haUG}ivpzA!$am~6Kv;y=DJ*dTz?pU77gWd9N6wVpWbQE$pg6^n~>O|PQa#)YE z=BBR0JPwg8IoJa^031FT0tayde}X$pV*oTbwb`^9fX4^3sXzsQt%}K~?*l;F0qe4I zz>pbNBqK^OH$~~qW|L7%VuL&Pt7snI7y_Gk3C|vw0wPZMgw6;c$zYFTHKe9bpd;j% z*)pHwG9MEwBd-KT#5A+%mdtRC%&UX&?~ z%&ZDi8X>ulqDU<3`aL#|4@f&YoE3-(BY!i3Y?YDUk{c-$)bFtCDpKSb*k%&|V{x9HBkZh)k(Yru`%prrnA5U@+YtSSph|ho_0O zYW~Txk{f^#G01Gnm6deOkd<`HP*o#P(P8i^JVPY#xOp=8byGY;z{e0KFv?T(|Wjeqch7i)32g%oq%Yh zUvq%S)K6q8f2zBC-W+^^ql5z(l{++!T<(i|uWKGhu5TQ<730Wl9!KtxapVq-BNrmt zk3^r29^mi68``nt#-hJuPwtQQWvJkkFJV(lLND38IHbcj_03PioipWew4#TP4|%ALC?w81Q>|@Zbm@ zHxYRJ`0+y8P@mC;w*d-+>gB--PSz%TCraIrUP68%+IX_W->|Srz5G%GNdN%yogcbK z+sHjiWSK(Z$tdAQ=S~Ju%Dtp!Bk3{bYGF$#S1SGg>Lz0ScqTW4d^_RNLARNPSEAJm zehVk(0zY+QZNjG91ZVsUlwr}0hUOOs61a|JBObiO!=Zl2!7F_iOpy-WW8~ny6yaQW zZdZQ-HaQssag=ZX_v7y$sAU76RI4_jx+JG1LHvV%g$AqUrWX&Z(CVJgf z&9y7hZ|^rCksFvIjS>z&`|#-p>dyx;Twl?y51IIpiQ!B_4HaeVq~;q>QURlM`Ak8q zv2t;uC>Qz-C<@6AX+m`W0ZwzFIjq0gsJXq4xAj)~Cv?C4&vmpxP}Y{}A{Xa4u;#8oLLKup0l?;om7L<`>3q%GbA_c=W*g9~M7%>#Og6dHHMe zOVhe8481pV?}p27WgUL)i+8^M#g@lgg_^R|xocwtHQQyo+Lk%$ty_Bg zkf3I|Y!}-XDc>SmK}|#IvGParTiH2P{Rdhy7xWDC-NP{hIW=howQ0BXul2Q0;`D>w z#_Il$YrSl1hWAhr<-wxvW95Zcj(WFG$=m*C?c;-mz5bcJeOSNZO55wo5|-#AA$pHF zTiN4JzImpAqPVMVmJ&q^nO$EM&aUd8R_8rOXY`gQsXZ?55utT${=&3Hf2VW&+xpr& zl$aESn1uGMPZR`rb4F9~z^U>*4Q(0vK~?PF9RG#k-FfQ9>xI^;{*wVkYidTkPAYy} z?>*~#TR2oSS&@Rx?kK*gStl$=Tl5b)hpFn1sPJxIk{P#-QXYkE>!*E$xovZ0F4G%MKlSTCkX z3$d5ezbiy<=zn9>TbuMYCUJMpz_TS*p~t$k{>Jg?%asb-zbc&9S+qb&RSQkU7w2~F zdEJw#`VeATrarMxSu`UUf-U1v!>LQZoVMPJ3dgNmqFnT|6=czO@3_)-({M4nPqWUV z>=#Wd^_vvCB(welERoW>cD6*XuRT|30M`Y{02}YLUx)41?j1Ldf57RrfL(8_$5ohh zsys>Mp}L4Y^g>!e2O7Ry=?#}RY-!j0v_->co+sbDQ|+rn?J_h^oRVtLyev?B6P*CW z+X(*C%L;2@T`7e@@)vOv%~F!jRQzIZ7emuXOhR#_#QT%n%{#85X+5r8hj0-WE1CC}O3WRc z-pry;oIQ%fR+cOQ$=&n20!!b~c_$7TaW!Z3?9nNM-+AUnFhUa5we VjsC1~_P)Vs)!yFf)MMex;=j`vd5Zu5 literal 5016 zcmb_ee^8V68GrJV7YLGsB!-dVn-E+aq$Ui7_H36But7yb3_`0NLB)f$L2!1pf0Vvp zLal;I#9cSryV$y(cRQUw8nEcP@XJx4J5bolqMHos+2gw6)^mEh>-M~F^5Tn~tBBu& zd%yX9pXc*D&*%B^zInFLU$hc{Ng)7F3Lpo7n1-}gYrn8iEX{h#&+|&sMMQF{aNlN% zEa2z)?G`$h1jnZ%M`3Cd=6q5keD3hfl;>gNIFD$m(MX`(R7l|bOBwuSy5~i7(NmCC& zs3mFAI*ZeploD~n5#7f2)n0RTr6)#S_4nLt~d!28z87lS3%->Vzv46Xa0>qyF@ zX~-ctN2jZ4M}Go=%hc9bLz2`{^-ckvN-?z9%TXD5HY;L!Y%i79MU}H~>Mls{zF4p>t)^QAz zVeC5V;VJC5NbE`$=OvcIYC)8-Phx}=@?z3kMNw8iS;Dfqh6nVW+x|B(Yc2=)7Cch> zLo9gyOoEQF;K?l1TUeZt1mOl#`sqv#Tr2G-013UD-F&AbZ25N=HRE{vCfDoh1>3}MZD4FrW7xBX0vCE|emSr+Q+ z8K^`S>J1QDP~4>FiC8E@1j^fh=W+O-B;=jutM>uCue_w}ZeTz!DysgNzh-FtktDPE z0Opfk736GrYL4=7Y>xPIl!sr<5&t3N>ChQ|0vlk960V)=&d_Ii1w1jH(;O*YzCqIz zrfKpxdV5#o;03M<65u}F>3QgqA3eHV(>!$D^U$rGhi+gVx~Jx$J3S9w5TBfmi;bVK zJVI?q=b|ga`eJMHdYn5`0R7jLZmr6x!sf+Azjvawsi+kftQp!um*+Z<6U*~lfh_5t zS7@n7jeZ+z;4xwiKSr!ekBr5gan$lE-%ejGNl)&FnQ@RaH+kwIwx6+g7UcPY<+30z_ z&a)0P-EO5bt-|#0jskuQ4-7MS!$oM80{xcD@L6hG#v++E6*Nt=1eI`f3^}24}}$GbK(LN#8p88q~h}dw({T_V~eNSRFSP!Q6t_m0WtT>k3aFTeTa_*bUa7x!GA{&4BLFO9$RpVlSfjJ| zWromJF?l#&w9hs5^Ind8deLouwzvG2{lrFz)42Nt%B#4xG$0_nA`7Tn3wXdw%^^vA>3&DvUEjvxTUx->7;5U zm#Fk2hNvU^okqdONR(>BdmLJjvAny=1EZalG-;}msh@F43rdgyFU@lbroh%R-PGe6 zbT75oj@wU!eVo8iH~#a|tusn+EeQ6z@XI7;b z-9ZyCygj=fuF~~Ti2`yifmm{WCWC-E3L^N@@mYnvj8PHNC6_nWDXQS|BlvsiuDb9I zKR5Q=lF8Cs=hVtAu1l`83r}%9#MZDg9{%Vvj)_6{ziF=OG+500dXEeTxivh7*}0IQ z!Dfl?3pa4c!IOR0OLKRGKM2gSK8DQizta9X*RFZ=Or!=D(^=Myb+%}fXCc0Cql^QH z{lYZMGJASa``*#c1a72-hp-ZyQ>q5K_gyDaxWtZSU1J+9^|?OB;6J9f%}zJlBPn}1 i=9Fjc5&v&<{TE4k^6nmPxp{6}b2$5IzlVA21N;}y9<2HR diff --git a/test/CodeGen/ARM/vaba.ll b/test/CodeGen/ARM/vaba.ll index e7aa6aecb96..4fe1c434799 100644 --- a/test/CodeGen/ARM/vaba.ll +++ b/test/CodeGen/ARM/vaba.ll @@ -6,8 +6,9 @@ define <8 x i8> @vabas8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B %tmp3 = load <8 x i8>* %C - %tmp4 = call <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) - ret <8 x i8> %tmp4 + %tmp4 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3) + %tmp5 = add <8 x i8> %tmp1, %tmp4 + ret <8 x i8> %tmp5 } define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { @@ -16,8 +17,9 @@ define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i16> %tmp4 + %tmp4 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3) + %tmp5 = add <4 x i16> %tmp1, %tmp4 + ret <4 x i16> %tmp5 } define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { @@ -26,8 +28,9 @@ define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i32> %tmp4 + %tmp4 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3) + %tmp5 = add <2 x i32> %tmp1, %tmp4 + ret <2 x i32> %tmp5 } define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { @@ -36,8 +39,9 @@ define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B %tmp3 = load <8 x i8>* %C - %tmp4 = call <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) - ret <8 x i8> %tmp4 + %tmp4 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3) + %tmp5 = add <8 x i8> %tmp1, %tmp4 + ret <8 x i8> %tmp5 } define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { @@ -46,8 +50,9 @@ define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i16> %tmp4 + %tmp4 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3) + %tmp5 = add <4 x i16> %tmp1, %tmp4 + ret <4 x i16> %tmp5 } define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { @@ -56,8 +61,9 @@ define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i32> %tmp4 + %tmp4 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3) + %tmp5 = add <2 x i32> %tmp1, %tmp4 + ret <2 x i32> %tmp5 } define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { @@ -66,8 +72,9 @@ define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B %tmp3 = load <16 x i8>* %C - %tmp4 = call <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3) - ret <16 x i8> %tmp4 + %tmp4 = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %tmp2, <16 x i8> %tmp3) + %tmp5 = add <16 x i8> %tmp1, %tmp4 + ret <16 x i8> %tmp5 } define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { @@ -76,8 +83,9 @@ define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B %tmp3 = load <8 x i16>* %C - %tmp4 = call <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3) - ret <8 x i16> %tmp4 + %tmp4 = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp3) + %tmp5 = add <8 x i16> %tmp1, %tmp4 + ret <8 x i16> %tmp5 } define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { @@ -86,8 +94,9 @@ define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B %tmp3 = load <4 x i32>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3) - ret <4 x i32> %tmp4 + %tmp4 = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %tmp2, <4 x i32> %tmp3) + %tmp5 = add <4 x i32> %tmp1, %tmp4 + ret <4 x i32> %tmp5 } define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { @@ -96,8 +105,9 @@ define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B %tmp3 = load <16 x i8>* %C - %tmp4 = call <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3) - ret <16 x i8> %tmp4 + %tmp4 = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %tmp2, <16 x i8> %tmp3) + %tmp5 = add <16 x i8> %tmp1, %tmp4 + ret <16 x i8> %tmp5 } define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { @@ -106,8 +116,9 @@ define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B %tmp3 = load <8 x i16>* %C - %tmp4 = call <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3) - ret <8 x i16> %tmp4 + %tmp4 = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp3) + %tmp5 = add <8 x i16> %tmp1, %tmp4 + ret <8 x i16> %tmp5 } define <4 x i32> @vabaQu32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { @@ -116,25 +127,26 @@ define <4 x i32> @vabaQu32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B %tmp3 = load <4 x i32>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3) - ret <4 x i32> %tmp4 + %tmp4 = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %tmp2, <4 x i32> %tmp3) + %tmp5 = add <4 x i32> %tmp1, %tmp4 + ret <4 x i32> %tmp5 } -declare <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone +declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone +declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { ;CHECK: vabals8: @@ -142,8 +154,10 @@ define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i8>* %B %tmp3 = load <8 x i8>* %C - %tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) - ret <8 x i16> %tmp4 + %tmp4 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3) + %tmp5 = zext <8 x i8> %tmp4 to <8 x i16> + %tmp6 = add <8 x i16> %tmp1, %tmp5 + ret <8 x i16> %tmp6 } define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { @@ -152,8 +166,10 @@ define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i32> %tmp4 + %tmp4 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3) + %tmp5 = zext <4 x i16> %tmp4 to <4 x i32> + %tmp6 = add <4 x i32> %tmp1, %tmp5 + ret <4 x i32> %tmp6 } define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { @@ -162,8 +178,10 @@ define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i64> %tmp4 + %tmp4 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3) + %tmp5 = zext <2 x i32> %tmp4 to <2 x i64> + %tmp6 = add <2 x i64> %tmp1, %tmp5 + ret <2 x i64> %tmp6 } define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { @@ -172,8 +190,10 @@ define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i8>* %B %tmp3 = load <8 x i8>* %C - %tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) - ret <8 x i16> %tmp4 + %tmp4 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp3) + %tmp5 = zext <8 x i8> %tmp4 to <8 x i16> + %tmp6 = add <8 x i16> %tmp1, %tmp5 + ret <8 x i16> %tmp6 } define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { @@ -182,8 +202,10 @@ define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i32> %tmp4 + %tmp4 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp2, <4 x i16> %tmp3) + %tmp5 = zext <4 x i16> %tmp4 to <4 x i32> + %tmp6 = add <4 x i32> %tmp1, %tmp5 + ret <4 x i32> %tmp6 } define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { @@ -192,38 +214,8 @@ define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i64> %tmp4 -} - -declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone - -define <8 x i8> @vabd_combine_s8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vabd_combine_s8: -;CHECK: vaba.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - %tmp4 = add <8 x i8> %tmp2, %tmp3 - ret <8 x i8> %tmp4 + %tmp4 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp2, <2 x i32> %tmp3) + %tmp5 = zext <2 x i32> %tmp4 to <2 x i64> + %tmp6 = add <2 x i64> %tmp1, %tmp5 + ret <2 x i64> %tmp6 } - -define <4 x i16> @vabd_combine_u16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vabd_combine_u16: -;CHECK: vaba.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - %tmp4 = add <4 x i16> %tmp3, %tmp1 - ret <4 x i16> %tmp4 -} - -declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone - diff --git a/test/CodeGen/ARM/vabd.ll b/test/CodeGen/ARM/vabd.ll index 2b453936145..9ec734fa764 100644 --- a/test/CodeGen/ARM/vabd.ll +++ b/test/CodeGen/ARM/vabd.ll @@ -151,8 +151,9 @@ define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vabdl.s8 %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 + %tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp4 = zext <8 x i8> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 } define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { @@ -160,8 +161,9 @@ define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vabdl.s16 %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 + %tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = zext <4 x i16> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 } define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { @@ -169,8 +171,9 @@ define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vabdl.s32 %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 + %tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = zext <2 x i32> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 } define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { @@ -178,8 +181,9 @@ define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vabdl.u8 %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 + %tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp4 = zext <8 x i8> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 } define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { @@ -187,8 +191,9 @@ define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vabdl.u16 %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 + %tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = zext <4 x i16> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 } define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { @@ -196,14 +201,7 @@ define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vabdl.u32 %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 + %tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = zext <2 x i32> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 } - -declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone -- 2.34.1