From 13894fa135d33151072ddf5e80abe4540ec2afcd Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 24 Aug 2011 06:14:18 +0000
Subject: [PATCH] Break 256-bit vector int add/sub/mul into two 128-bit
 operations to avoid costly scalarization. Fixes PR10711.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138427 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  71 +++++++++++++++-
 lib/Target/X86/X86ISelLowering.h   |   4 +-
 test/CodeGen/X86/avx-arith.ll      | 128 +++++++++++++++++++++++++++++
 3 files changed, 200 insertions(+), 3 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 88566c687e2..a1f07329dbf 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -998,6 +998,21 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
     setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
 
+    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+    setOperationAction(ISD::ADD, MVT::v16i16, Custom);
+    setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+    setOperationAction(ISD::SUB, MVT::v16i16, Custom);
+    setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+
+    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v16i16, Custom);
+    // Don't lower v32i8 because there is no 128-bit byte mul
+
     // Custom lower several nodes for 256-bit types.
     for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
          i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
@@ -9422,8 +9437,58 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
   return Op;
 }
 
-SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
+// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
+// ones, and then concatenate the result back.
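+// For example, a v8i32 add becomes two v4i32 adds: one on the low halves
+// of the operands (extracted at element 0) and one on the high halves
+// (extracted at NumElems/2), rejoined with a CONCAT_VECTORS node.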
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
+         "Unsupported value type for operation");
+
+  int NumElems = VT.getVectorNumElements();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+  SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType().getSizeInBits() == 256 &&
+         Op.getValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType().getSizeInBits() == 256 &&
+         Op.getValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.getSizeInBits() == 256)
+    return Lower256IntArith(Op, DAG);
+
   assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
 
   DebugLoc dl = Op.getDebugLoc();
@@ -10013,7 +10078,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::CTLZ:        return LowerCTLZ(Op, DAG);
   case ISD::CTTZ:        return LowerCTTZ(Op, DAG);
-  case ISD::MUL:         return LowerMUL_V2I64(Op, DAG);
+  case ISD::MUL:         return LowerMUL(Op, DAG);
   case ISD::SRA:
   case ISD::SRL:
   case ISD::SHL:         return LowerShift(Op, DAG);
@@ -10029,6 +10094,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADDE:
   case ISD::SUBC:
   case ISD::SUBE:        return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::ADD:         return LowerADD(Op, DAG);
+  case ISD::SUB:         return LowerSUB(Op, DAG);
   }
 }
 
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 908157429ed..c13d0c762cc 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -819,7 +819,9 @@ namespace llvm {
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/test/CodeGen/X86/avx-arith.ll b/test/CodeGen/X86/avx-arith.ll
index 553e8acda97..59988ca8b68 100644
--- a/test/CodeGen/X86/avx-arith.ll
+++ b/test/CodeGen/X86/avx-arith.ll
@@ -131,3 +131,131 @@ entry:
 }
 
 declare float @sqrtf(float) readnone
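+
+; On AVX, 256-bit integer arithmetic is not available as a single
+; instruction, so each op below is expected to split into two 128-bit
+; halves: two vextractf128s, two 128-bit integer ops, and a vinsertf128
+; to rebuild the 256-bit result.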
+
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <4 x i64> @vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+  %x = add <4 x i64> %i, %j
+  ret <4 x i64> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpaddd %xmm
+; CHECK-NEXT: vpaddd %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <8 x i32> @vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+  %x = add <8 x i32> %i, %j
+  ret <8 x i32> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpaddw %xmm
+; CHECK-NEXT: vpaddw %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <16 x i16> @vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+  %x = add <16 x i16> %i, %j
+  ret <16 x i16> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpaddb %xmm
+; CHECK-NEXT: vpaddb %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <32 x i8> @vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+  %x = add <32 x i8> %i, %j
+  ret <32 x i8> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpsubq %xmm
+; CHECK-NEXT: vpsubq %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <4 x i64> @vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+  %x = sub <4 x i64> %i, %j
+  ret <4 x i64> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpsubd %xmm
+; CHECK-NEXT: vpsubd %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <8 x i32> @vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+  %x = sub <8 x i32> %i, %j
+  ret <8 x i32> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpsubw %xmm
+; CHECK-NEXT: vpsubw %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <16 x i16> @vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+  %x = sub <16 x i16> %i, %j
+  ret <16 x i16> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpsubb %xmm
+; CHECK-NEXT: vpsubb %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <32 x i8> @vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+  %x = sub <32 x i8> %i, %j
+  ret <32 x i8> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpmulld %xmm
+; CHECK-NEXT: vpmulld %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <8 x i32> @vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+  %x = mul <8 x i32> %i, %j
+  ret <8 x i32> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpmullw %xmm
+; CHECK-NEXT: vpmullw %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+  %x = mul <16 x i16> %i, %j
+  ret <16 x i16> %x
+}
+
+; CHECK: vextractf128 $1
+; CHECK-NEXT: vextractf128 $1
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsrlq $32, %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsllq $32, %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsrlq $32, %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsllq $32, %xmm
+; CHECK-NEXT: vpsrlq $32, %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsllq $32, %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpsrlq $32, %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsllq $32, %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vinsertf128 $1
+define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+  %x = mul <4 x i64> %i, %j
+  ret <4 x i64> %x
+}
+
-- 
2.34.1
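
Illustration (not part of the commit): for the <4 x i64> add case, the new lowering yields the same shape of code as this minimal standalone C++ sketch using SSE2/AVX intrinsics from <immintrin.h>. The helper name add_v4i64 is made up for the example; the commented mnemonics correspond to the CHECK lines in avx-arith.ll above. Only AVX1 is required.

#include <immintrin.h>

/* Split each 256-bit operand into its 128-bit halves, add the halves with
   the SSE2 integer add, then rebuild the 256-bit result. */
static __m256i add_v4i64(__m256i a, __m256i b) {
  __m128i a_lo = _mm256_castsi256_si128(a);       /* low half, no instruction */
  __m128i a_hi = _mm256_extractf128_si256(a, 1);  /* vextractf128 $1 */
  __m128i b_lo = _mm256_castsi256_si128(b);
  __m128i b_hi = _mm256_extractf128_si256(b, 1);  /* vextractf128 $1 */
  __m128i lo = _mm_add_epi64(a_lo, b_lo);         /* vpaddq %xmm */
  __m128i hi = _mm_add_epi64(a_hi, b_hi);         /* vpaddq %xmm */
  /* vinsertf128 $1 */
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}

Keeping the values in xmm/ymm registers end to end is the point of the patch: without the custom lowering, type legalization would break a 256-bit integer op into per-element extracts, scalar ops, and inserts, the costly scalarization the commit message refers to.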