[PowerPC] Enable interleaved-access vectorization

[oota-llvm.git] / lib / Target / AMDGPU / SIISelLowering.cpp
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index 25798c42a8c9f88b7f3916a33e360b043fefabb0..e7dcd8a32679c5fb7097819cf799bb7c59ecf9ae 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -261,6 +261,41 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
    return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
  }
  
+bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
+  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
+  // additionally can do r + r + i with addr64. 32-bit has more addressing
+  // mode options. Depending on the resource constant, it can also do
+  // (i64 r0) + (i32 r1) * (i14 i).
+  //
+  // Private arrays end up using a scratch buffer most of the time, so also
+  // assume those use MUBUF instructions. Scratch loads / stores are currently
+  // implemented as mubuf instructions with offen bit set, so slightly
+  // different than the normal addr64.
+  if (!isUInt<12>(AM.BaseOffs))
+    return false;
+
+  // FIXME: Since we can split immediate into soffset and immediate offset,
+  // would it make sense to allow any immediate?
+
+  switch (AM.Scale) {
+  case 0: // r + i or just i, depending on HasBaseReg.
+    return true;
+  case 1:
+    return true; // We have r + r or r + i.
+  case 2:
+    if (AM.HasBaseReg) {
+      // Reject 2 * r + r.
+      return false;
+    }
+
+    // Allow 2 * r as r + r
+    // Or  2 * r + i is allowed as r + r + i.
+    return true;
+  default: // Don't allow n * r
+    return false;
+  }
+}
+
  bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                               const AddrMode &AM, Type *Ty,
                                               unsigned AS) const {
@@ -269,7 +304,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
      return false;
  
    switch (AS) {
-  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS: {
      if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        // Assume the we will use FLAT for all global memory accesses
        // on VI.
@@ -282,51 +317,51 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
        // because it has never been validated.
        return isLegalFlatAddressingMode(AM);
      }
-    // fall-through
-  case AMDGPUAS::PRIVATE_ADDRESS:
-  case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
-  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
-    // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
-    // additionally can do r + r + i with addr64. 32-bit has more addressing
-    // mode options. Depending on the resource constant, it can also do
-    // (i64 r0) + (i32 r1) * (i14 i).
-    //
-    // SMRD instructions have an 8-bit, dword offset.
-    //
-    // Assume nonunifom access, since the address space isn't enough to know
-    // what instruction we will use, and since we don't know if this is a load
-    // or store and scalar stores are only available on VI.
-    //
-    // We also know if we are doing an extload, we can't do a scalar load.
-    //
-    // Private arrays end up using a scratch buffer most of the time, so also
-    // assume those use MUBUF instructions. Scratch loads / stores are currently
-    // implemented as mubuf instructions with offen bit set, so slightly
-    // different than the normal addr64.
-    if (!isUInt<12>(AM.BaseOffs))
-      return false;
  
-    // FIXME: Since we can split immediate into soffset and immediate offset,
-    // would it make sense to allow any immediate?
+    return isLegalMUBUFAddressingMode(AM);
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    // If the offset isn't a multiple of 4, it probably isn't going to be
+    // correctly aligned.
+    if (AM.BaseOffs % 4 != 0)
+      return isLegalMUBUFAddressingMode(AM);
+
+    // There are no SMRD extloads, so if we have to do a small type access we
+    // will use a MUBUF load.
+    // FIXME?: We also need to do this if unaligned, but we don't know the
+    // alignment here.
+    if (DL.getTypeStoreSize(Ty) < 4)
+      return isLegalMUBUFAddressingMode(AM);
+
+    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      // SMRD instructions have an 8-bit, dword offset on SI.
+      if (!isUInt<8>(AM.BaseOffs / 4))
+        return false;
+    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+      // On CI+, this can also be a 32-bit literal constant offset. If it fits
+      // in 8-bits, it can use a smaller encoding.
+      if (!isUInt<32>(AM.BaseOffs / 4))
+        return false;
+    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
+      if (!isUInt<20>(AM.BaseOffs))
+        return false;
+    } else
+      llvm_unreachable("unhandled generation");
  
-    switch (AM.Scale) {
-    case 0: // r + i or just i, depending on HasBaseReg.
+    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
        return true;
-    case 1:
-      return true; // We have r + r or r + i.
-    case 2:
-      if (AM.HasBaseReg) {
-        // Reject 2 * r + r.
-        return false;
-      }
  
-      // Allow 2 * r as r + r
-      // Or  2 * r + i is allowed as r + r + i.
+    if (AM.Scale == 1 && AM.HasBaseReg)
        return true;
-    default: // Don't allow n * r
-      return false;
-    }
+
+    return false;
    }
+
+  case AMDGPUAS::PRIVATE_ADDRESS:
+  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+    return isLegalMUBUFAddressingMode(AM);
+
    case AMDGPUAS::LOCAL_ADDRESS:
    case AMDGPUAS::REGION_ADDRESS: {
      // Basic, single offset DS instructions allow a 16-bit unsigned immediate
@@ -374,7 +409,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
      // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
      // aligned, 8 byte access in a single operation using ds_read2/write2_b32
      // with adjacent offsets.
-    return Align % 4 == 0;
+    bool AlignedBy4 = (Align % 4 == 0);
+    if (IsFast)
+      *IsFast = AlignedBy4;
+    return AlignedBy4;
    }
  
    // Smaller than dword value must be aligned.
@@ -513,7 +551,7 @@ SDValue SITargetLowering::LowerFormalArguments(
        assert((PSInputNum <= 15) && "Too many PS inputs!");
  
        if (!Arg.Used) {
-        // We can savely skip PS inputs
+        // We can safely skip PS inputs
          Skipped.set(i);
          ++PSInputNum;
          continue;
@@ -530,7 +568,7 @@ SDValue SITargetLowering::LowerFormalArguments(
  
        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
        // three or five element vertex only needs three or five registers,
-      // NOT four or eigth.
+      // NOT four or eight.
        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
        unsigned NumElements = ParamType->getVectorNumElements();
  
@@ -617,7 +655,7 @@ SDValue SITargetLowering::LowerFormalArguments(
                                     Offset, Ins[i].Flags.isSExt());
        Chains.push_back(Arg.getValue(1));
  
-      const PointerType *ParamTy =
+      auto *ParamTy =
          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
        if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
            ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
@@ -2213,9 +2251,9 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
  }
  
  /// \brief Return a resource descriptor with the 'Add TID' bit enabled
-///        The TID (Thread ID) is multipled by the stride value (bits [61:48]
-///        of the resource descriptor) to create an offset, which is added to the
-///        resource ponter.
+///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+///        of the resource descriptor) to create an offset, which is added to
+///        the resource pointer.
  MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
                                             SDLoc DL,
                                             SDValue Ptr,