From ecebcfc3a1e9e115f04c2610779d6020f8e1d03b Mon Sep 17 00:00:00 2001
From: Hal Finkel
Date: Thu, 3 Sep 2015 21:23:18 +0000
Subject: [PATCH] [PowerPC] Include the permutation cost for unaligned vector
 loads

Pre-P8, when we generate code for unaligned vector loads (for Altivec and
QPX types), even when accounting for the combining that takes place for
multiple consecutive such loads, there is at least one load instruction and
one permutation for each load. Make sure the cost reported reflects the cost
of the permutes as well.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246807 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 20 +++++++++-------
 test/Analysis/CostModel/PowerPC/load_store.ll |  2 +-
 .../CostModel/PowerPC/unal-vec-ldst.ll        | 24 +++++++++----------
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 47469da3f6c..937d9c6810d 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -333,6 +333,18 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   bool IsQPXType = ST->hasQPX() &&
                    (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
 
+  // If we can use the permutation-based load sequence, then this is also
+  // relatively cheap (not counting loop-invariant instructions): one load plus
+  // one permute (the last load in a series has extra cost, but we're
+  // neglecting that here). Note that on the P7, we should do unaligned loads
+  // for Altivec types using the VSX instructions, but that's more expensive
+  // than using the permutation-based load sequence. On the P8, that's no
+  // longer true.
+  if (Opcode == Instruction::Load &&
+      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+      Alignment >= LT.second.getScalarType().getStoreSize())
+    return Cost + LT.first; // Add the cost of the permutations.
+
   // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
   // P7, unaligned vector loads are more expensive than the permutation-based
   // load sequence, so that might be used instead, but regardless, the net cost
@@ -340,14 +352,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   if (IsVSXType || (ST->hasVSX() && IsAltivecType))
     return Cost;
 
-  // If we can use the permutation-based load sequence, then this is also
-  // relatively cheap (not counting loop-invariant instructions).
-  bool PermutationLoad = Opcode == Instruction::Load &&
-                         (IsAltivecType || IsQPXType) &&
-                         Alignment >= LT.second.getScalarType().getStoreSize();
-  if (PermutationLoad)
-    return Cost;
-
   // PPC in general does not support unaligned loads and stores. They'll need
   // to be decomposed based on the alignment factor.
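For illustration, the arithmetic behind the new return path can be modeled in
isolation. This is a minimal standalone C++ sketch, not the committed code:
permutationLoadCost is a hypothetical helper standing in for the
Cost + LT.first computation above, where LT.first is the number of vector
registers the type legalizes to and Cost covers the load instructions alone
(one per register).

#include <cassert>

// One load plus one permute per legalized register, as the comment in
// getMemoryOpCost describes for the permutation-based load sequence.
//   NumRegs  ~ LT.first (registers the vector type splits into)
//   LoadCost ~ Cost (the loads alone, one per register)
static int permutationLoadCost(int NumRegs, int LoadCost) {
  return LoadCost + NumRegs; // add the cost of the permutations
}

int main() {
  // <4 x i32>, align 4: one v4i32 register -> 1 load + 1 permute = 2.
  assert(permutationLoadCost(1, 1) == 2);
  // <32 x i8>, align 1: two v16i8 registers -> 2 loads + 2 permutes = 4.
  assert(permutationLoadCost(2, 2) == 4);
  return 0;
}

These two cases match the updated CHECK lines in the test diffs that follow.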
diff --git a/test/Analysis/CostModel/PowerPC/load_store.ll b/test/Analysis/CostModel/PowerPC/load_store.ll
index 9501deedeaa..0a568b88e72 100644
--- a/test/Analysis/CostModel/PowerPC/load_store.ll
+++ b/test/Analysis/CostModel/PowerPC/load_store.ll
@@ -34,7 +34,7 @@ define i32 @loads(i32 %arg) {
   ; CHECK: cost of 48 {{.*}} load
   load <4 x i16>, <4 x i16>* undef, align 2
 
-  ; CHECK: cost of 1 {{.*}} load
+  ; CHECK: cost of 2 {{.*}} load
   load <4 x i32>, <4 x i32>* undef, align 4
 
   ; CHECK: cost of 46 {{.*}} load
diff --git a/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll b/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
index b983d84ce72..3b1bc3b3fdb 100644
--- a/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
+++ b/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll
@@ -8,7 +8,7 @@ entry:
   ret <16 x i8> %r
 
 ; CHECK-LABEL: test_l_v16i8
-; CHECK: cost of 1 for instruction: %r = load <16 x i8>, <16 x i8>* %p, align 1
+; CHECK: cost of 2 for instruction: %r = load <16 x i8>, <16 x i8>* %p, align 1
 }
 
 define <32 x i8> @test_l_v32i8(<32 x i8>* %p) #0 {
@@ -17,7 +17,7 @@ entry:
   ret <32 x i8> %r
 
 ; CHECK-LABEL: test_l_v32i8
-; CHECK: cost of 2 for instruction: %r = load <32 x i8>, <32 x i8>* %p, align 1
+; CHECK: cost of 4 for instruction: %r = load <32 x i8>, <32 x i8>* %p, align 1
 }
 
 define <8 x i16> @test_l_v8i16(<8 x i16>* %p) #0 {
@@ -26,7 +26,7 @@ entry:
   ret <8 x i16> %r
 
 ; CHECK-LABEL: test_l_v8i16
-; CHECK: cost of 1 for instruction: %r = load <8 x i16>, <8 x i16>* %p, align 2
+; CHECK: cost of 2 for instruction: %r = load <8 x i16>, <8 x i16>* %p, align 2
 }
 
 define <16 x i16> @test_l_v16i16(<16 x i16>* %p) #0 {
@@ -35,7 +35,7 @@ entry:
   ret <16 x i16> %r
 
 ; CHECK-LABEL: test_l_v16i16
-; CHECK: cost of 2 for instruction: %r = load <16 x i16>, <16 x i16>* %p, align 2
+; CHECK: cost of 4 for instruction: %r = load <16 x i16>, <16 x i16>* %p, align 2
 }
 
 define <4 x i32> @test_l_v4i32(<4 x i32>* %p) #0 {
@@ -44,7 +44,7 @@ entry:
   ret <4 x i32> %r
 
 ; CHECK-LABEL: test_l_v4i32
-; CHECK: cost of 1 for instruction: %r = load <4 x i32>, <4 x i32>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x i32>, <4 x i32>* %p, align 4
 }
 
 define <8 x i32> @test_l_v8i32(<8 x i32>* %p) #0 {
@@ -53,7 +53,7 @@ entry:
   ret <8 x i32> %r
 
 ; CHECK-LABEL: test_l_v8i32
-; CHECK: cost of 2 for instruction: %r = load <8 x i32>, <8 x i32>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x i32>, <8 x i32>* %p, align 4
 }
 
 define <2 x i64> @test_l_v2i64(<2 x i64>* %p) #0 {
@@ -80,7 +80,7 @@ entry:
   ret <4 x float> %r
 
 ; CHECK-LABEL: test_l_v4float
-; CHECK: cost of 1 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
 }
 
 define <8 x float> @test_l_v8float(<8 x float>* %p) #0 {
@@ -89,7 +89,7 @@ entry:
   ret <8 x float> %r
 
 ; CHECK-LABEL: test_l_v8float
-; CHECK: cost of 2 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
 }
 
 define <2 x double> @test_l_v2double(<2 x double>* %p) #0 {
@@ -224,7 +224,7 @@ entry:
   ret <4 x float> %r
 
 ; CHECK-LABEL: test_l_qv4float
-; CHECK: cost of 1 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
+; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4
 }
 
 define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 {
@@ -233,7 +233,7 @@ entry:
   ret <8 x float> %r
 
 ; CHECK-LABEL: test_l_qv8float
-; CHECK: cost of 2 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
+; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4
 }
 
 define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 {
@@ -242,7 +242,7 @@ entry:
   ret <4 x double> %r
 
 ; CHECK-LABEL: test_l_qv4double
-; CHECK: cost of 1 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8
+; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8
 }
 
 define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 {
@@ -251,7 +251,7 @@ entry:
   ret <8 x double> %r
 
 ; CHECK-LABEL: test_l_qv8double
-; CHECK: cost of 2 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8
+; CHECK: cost of 4 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8
 }
 
 define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
-- 
2.34.1
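To reproduce these numbers, the cost-model analysis can be run through opt.
The RUN lines of these tests are not shown in the hunks above, so the exact
flags below are an assumption based on how CostModel tests of this vintage
were typically invoked:

  opt -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 \
      < test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll

On a pre-P8 target, each unaligned vector load should now be reported with
the permute cost included, matching the updated CHECK lines above.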