AMDGPU: Split x8 and x16 vector loads instead of scalarizing

[oota-llvm.git] / lib / Target / AMDGPU / AMDGPUISelLowering.cpp
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index af9fcbde9f1a83fe5827aa8a396c86a66a88592e..b73172cec26dc26b91cacf7d6fc19290e082c4cf 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -394,6 +394,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
  
    setFsqrtIsCheap(true);
  
+  // We want to find all load dependencies for long chains of stores to enable
+  // merging into very wide vectors. The problem is with vectors of more than 4
+  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
+  // vectors are legal types, even though we usually have to split the
+  // loads. When we can more precisely specify load legality per address
+  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
+  // smarter so that they can figure out what to do in 2 iterations without all
+  // N > 4 stores on the same chain.
+  GatherAllAliasesMaxDepth = 16;
+
    // FIXME: Need to really handle these.
    MaxStoresPerMemcpy  = 4096;
    MaxStoresPerMemmove = 4096;