From: Bruno Cardoso Lopes Date: Thu, 12 Aug 2010 02:06:36 +0000 (+0000) Subject: Begin to support some vector operations for AVX 256-bit instructions. The long X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=8c05a850f43cda2e62ac48562281f504ab763d59;p=oota-llvm.git Begin to support some vector operations for AVX 256-bit instructions. The long term goal here is to be able to match enough of vector_shuffle and build_vector so all avx intrinsics which aren't mapped to their own built-ins but to shufflevector calls can be codegen'd. This is the first (baby) step, support building zeroed vectors. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@110897 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 41f630f4a99..15bbf070f58 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -883,7 +883,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); - //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom); //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom); //setOperationAction(ISD::SELECT, MVT::v8f32, Custom); @@ -3412,18 +3412,27 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest - // type. This ensures they get CSE'd. + // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted + // to their dest type. This ensures they get CSE'd. 
SDValue Vec; if (VT.getSizeInBits() == 64) { // MMX SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); - } else if (HasSSE2) { // SSE2 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - } else { // SSE1 + } else if (VT.getSizeInBits() == 128) { + if (HasSSE2) { // SSE2 + SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + } else { // SSE1 + SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); + } + } else if (VT.getSizeInBits() == 256) { // AVX + // 256-bit logic and arithmetic instructions in AVX are + // all floating-point, no support for integer ops. Default + // to emitting fp zeroed vectors then. SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); } return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); } @@ -3437,9 +3446,9 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { // type. This ensures they get CSE'd. SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; - if (VT.getSizeInBits() == 64) // MMX + if (VT.getSizeInBits() == 64) // MMX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); - else // SSE + else // SSE Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); } @@ -3844,9 +3853,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - // All zero's are handled with pxor, all one's are handled with pcmpeqd. 
- if (ISD::isBuildVectorAllZeros(Op.getNode()) - || ISD::isBuildVectorAllOnes(Op.getNode())) { + // All zero's are handled with pxor in SSE2 and above, xorps in SSE1 and + // all one's are handled with pcmpeqd. In AVX, zero's are handled with + // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd + // is present, so AllOnes is ignored. + if (ISD::isBuildVectorAllZeros(Op.getNode()) || + (Op.getValueType().getSizeInBits() != 256 && + ISD::isBuildVectorAllOnes(Op.getNode()))) { // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are // eliminated on x86-32 hosts. diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index e24e5530c5f..d3e2181d7c6 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2186,6 +2186,14 @@ def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllZerosV))]>; } +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isCodeGenOnly = 1, Predicates = [HasAVX] in { +def V_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V; +def V_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V; +} + def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>; def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>; def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index ae20e689ef8..e77c7c7e575 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -374,12 +374,14 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { case X86::MMX_V_SET0: LowerUnaryToTwoAddr(OutMI, X86::MMX_PXORrr); break; case X86::MMX_V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::MMX_PCMPEQDrr); break; - case X86::FsFLD0SS: 
LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; - case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break; - case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; + case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; + case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; + case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; + case X86::V_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break; + case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break; + case X86::V_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break; + case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; + case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; case X86::MOV16r0: LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0 diff --git a/test/CodeGen/X86/avx-256.ll b/test/CodeGen/X86/avx-256.ll new file mode 100644 index 00000000000..20d31e73885 --- /dev/null +++ b/test/CodeGen/X86/avx-256.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7 -mattr=avx | FileCheck %s + +@x = common global <8 x float> zeroinitializer, align 32 +@y = common global <4 x double> zeroinitializer, align 32 + +define void @zero() nounwind ssp { +entry: + ; CHECK: vxorps + ; CHECK: vmovaps + ; CHECK: vmovaps + store <8 x float> zeroinitializer, <8 x float>* @x, align 32 + store <4 x double> zeroinitializer, <4 x double>* @y, align 32 + ret void +} +