Codegen allonesvector better while using AVX: vpcmpeqd + vinsertf128

author Bruno Cardoso Lopes <bruno.cardoso@gmail.com>

Mon, 25 Jul 2011 23:05:32 +0000 (23:05 +0000)

committer Bruno Cardoso Lopes <bruno.cardoso@gmail.com>

Mon, 25 Jul 2011 23:05:32 +0000 (23:05 +0000)
author Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
Mon, 25 Jul 2011 23:05:32 +0000 (23:05 +0000)
committer Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
Mon, 25 Jul 2011 23:05:32 +0000 (23:05 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 2f74c0fdd46bc684d5c657b3292bcf87281ef5b6..793770aa50ae0486a1ad5b2b36dc5d63af095177 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3831,21 +3831,25 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
  }
  
  /// getOnesVector - Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32> or <8 x i32> bitcasted to
-/// their original type, ensuring they get CSE'd.
+/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
+/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their
+/// original type, ensuring they get CSE'd.
  static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
    assert((VT.is128BitVector() || VT.is256BitVector())
           && "Expected a 128-bit or 256-bit vector type");
  
    SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                            Cst, Cst, Cst, Cst);
  
-  SDValue Vec;
    if (VT.is256BitVector()) {
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
-  } else
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
+                              Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
+    Vec = Insert128BitVector(InsV, Vec,
+                  DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+  }
+
    return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  }
  
@@ -12023,6 +12027,35 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  // Match direct AllOnes for 128 and 256-bit vectors
+  if (ISD::isBuildVectorAllOnes(N))
+    return true;
+
+  // Look through a bit convert.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0).getNode();
+
+  // Sometimes the operand may come from an insert_subvector building a
+  // 256-bit allones vector. Check the opcode before reading the operands:
+  // an arbitrary node is not guaranteed to have two of them, so calling
+  // getOperand(1) unconditionally can index out of range.
+  if (VT.getSizeInBits() != 256 || N->getOpcode() != ISD::INSERT_SUBVECTOR)
+    return false;
+
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+
+  return V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+         ISD::isBuildVectorAllOnes(V2.getNode());
+}
+
  static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
@@ -12047,12 +12080,14 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
  
    // Check LHS for vnot
    if (N0.getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
      return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
  
    // Check RHS for vnot
    if (N1.getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
      return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
  
    return SDValue();
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index 1ab02780cdfc3ef4d649cf6b039d6e7524b3632b..fda04749c0165b5acadb81076c8e59975cee5039 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2450,6 +2450,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
      case X86::AVX_SET0PS:
      case X86::AVX_SET0PD:
      case X86::AVX_SET0PI:
+    case X86::AVX_SETALLONES:
        Alignment = 16;
        break;
      case X86::FsFLD0SD:
@@ -2494,6 +2495,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
    case X86::AVX_SET0PI:
    case X86::AVX_SET0PSY:
    case X86::AVX_SET0PDY:
+  case X86::AVX_SETALLONES:
    case X86::FsFLD0SD:
    case X86::FsFLD0SS:
    case X86::VFsFLD0SD:
@@ -2531,9 +2533,10 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
        Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
      else
        Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
-    const Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
-                    Constant::getAllOnesValue(Ty) :
-                    Constant::getNullValue(Ty);
+
+    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES);
+    const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+                                    Constant::getNullValue(Ty);
      unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
  
      // Create operands to load from the constant pool entry.
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index d574a7b8db319b0e8d62f095534fe9ee3ec6627b..b5ac5feb8c743edabd6f065da3bb40aa4d63937f 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3143,11 +3143,17 @@ def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
  // Alias instructions that map zero vector to pxor / xorp* for sse.
  // We set canFoldAsLoad because this can be converted to a constant-pool
  // load of an all-ones value if folding it would be beneficial.
+// FIXME: Change encoding to pseudo! This is blocked right now by the x86
+// JIT implementation, it does not expand the instructions below like
+// X86MCInstLower does.
  let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
      isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
-  // FIXME: Change encoding to pseudo.
    def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
                           [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in
+  def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
+                         [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
  
  //===---------------------------------------------------------------------===//
  // SSE3 - Conversion Instructions
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp

index e3853355553461289453b2b186bac1bc3f3588e3..2ed596af15e2c0d662a43c7102c9dee76af774f9 100644 (file)
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -381,6 +381,7 @@ ReSimplify:
    case X86::AVX_SET0PD:    LowerUnaryToTwoAddr(OutMI, X86::VXORPDrr); break;
    case X86::AVX_SET0PDY:   LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
    case X86::AVX_SET0PI:    LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
+  case X86::AVX_SETALLONES:  LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
  
    case X86::MOV16r0:
      LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV16r0 -> MOV32r0
diff --git a/test/CodeGen/X86/avx-256.ll b/test/CodeGen/X86/avx-256.ll

index a6d1450c9c19dfef9c72f3697b8e8321d4d12868..244bf98ce6802b08a5adb7f5c92cca9ebb69b7ae 100644 (file)
--- a/test/CodeGen/X86/avx-256.ll
+++ b/test/CodeGen/X86/avx-256.ll
@@ -12,3 +12,15 @@ entry:
    store <4 x double> zeroinitializer, <4 x double>* @y, align 32
    ret void
  }
+
+; CHECK: vpcmpeqd
+; CHECK: vinsertf128 $1
+define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind {
+allocas:
+  %ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*
+  store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
+0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
+0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000>, <8 x
+float>* %ptr2vec615, align 32
+  ret void
+}
author	Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
	Mon, 25 Jul 2011 23:05:32 +0000 (23:05 +0000)
committer	Bruno Cardoso Lopes <bruno.cardoso@gmail.com>
	Mon, 25 Jul 2011 23:05:32 +0000 (23:05 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86InstrInfo.cpp		patch \| blob \| history
lib/Target/X86/X86InstrSSE.td		patch \| blob \| history
lib/Target/X86/X86MCInstLower.cpp		patch \| blob \| history
test/CodeGen/X86/avx-256.ll		patch \| blob \| history