From ac515c408782decf9ce1dff2cfc736dc790bd244 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 1 Sep 2015 20:51:51 +0000
Subject: [PATCH] rename "slow-unaligned-mem-under-32" to
 "slow-unaligned-mem-16" (NFCI)

This is a follow-on suggested by:
http://reviews.llvm.org/D12154 ( http://reviews.llvm.org/rL245729 )
http://reviews.llvm.org/D10662 ( http://reviews.llvm.org/rL245075 )

This makes the attribute name match most of the existing lowering logic
and regression test expectations. But the current use of this attribute
is inconsistent; see the FIXME comment for
"allowsMisalignedMemoryAccesses()". Fixing that inconsistency will be a
functional change and should be coming soon.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246585 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86.td              | 90 ++++++++++++++++--------------
 lib/Target/X86/X86ISelLowering.cpp |  6 +-
 lib/Target/X86/X86InstrInfo.cpp    |  6 +-
 lib/Target/X86/X86Subtarget.cpp    |  4 +-
 lib/Target/X86/X86Subtarget.h      |  6 +-
 5 files changed, 59 insertions(+), 53 deletions(-)
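
Notes: target-independent code reaches the renamed hint through
allowsMisalignedMemoryAccesses(), whose X86 override appears in the
X86ISelLowering.cpp hunk below. A minimal sketch of how a caller might
consult it (illustrative only; `TLI` and the enclosing function are
assumed context, not part of this patch):

    // Ask the target whether a 1-aligned (i.e. unaligned) 16-byte vector
    // access is considered fast before emitting one.
    bool Fast = false;
    if (TLI.allowsMisalignedMemoryAccesses(MVT::v4i32, /*AddrSpace=*/0,
                                           /*Align=*/1, &Fast) &&
        Fast) {
      // A single unaligned 16-byte load/store is acceptable here.
    } else {
      // Split into smaller or aligned accesses instead.
    }
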
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index d00a1113e2e..3a3b03874c0 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -79,9 +79,10 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
                                         "Bit testing of memory is slow">;
 def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
                                        "SHLD instruction is slow">;
-def FeatureSlowUAMem : SubtargetFeature<"slow-unaligned-mem-under-32",
-                                        "IsUAMemUnder32Slow", "true",
-                                        "Slow unaligned 16-byte-or-less memory access">;
+// FIXME: This should not apply to CPUs that do not have SSE.
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
+                                          "IsUAMem16Slow", "true",
+                                          "Slow unaligned 16-byte memory access">;
 def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
                                           "IsUAMem32Slow", "true",
                                           "Slow unaligned 32-byte memory access">;
@@ -209,42 +210,45 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
                     "Intel Silvermont processors">;
 
 class Proc<string Name, list<SubtargetFeature> Features>
  : ProcessorModel<Name, GenericModel, Features>;
 
-def : Proc<"generic",         [FeatureSlowUAMem]>;
-def : Proc<"i386",            [FeatureSlowUAMem]>;
-def : Proc<"i486",            [FeatureSlowUAMem]>;
-def : Proc<"i586",            [FeatureSlowUAMem]>;
-def : Proc<"pentium",         [FeatureSlowUAMem]>;
-def : Proc<"pentium-mmx",     [FeatureSlowUAMem, FeatureMMX]>;
-def : Proc<"i686",            [FeatureSlowUAMem]>;
-def : Proc<"pentiumpro",      [FeatureSlowUAMem, FeatureCMOV]>;
-def : Proc<"pentium2",        [FeatureSlowUAMem, FeatureMMX, FeatureCMOV]>;
-def : Proc<"pentium3",        [FeatureSlowUAMem, FeatureSSE1]>;
-def : Proc<"pentium3m",       [FeatureSlowUAMem, FeatureSSE1, FeatureSlowBTMem]>;
-def : Proc<"pentium-m",       [FeatureSlowUAMem, FeatureSSE2, FeatureSlowBTMem]>;
-def : Proc<"pentium4",        [FeatureSlowUAMem, FeatureSSE2]>;
-def : Proc<"pentium4m",       [FeatureSlowUAMem, FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"generic",         [FeatureSlowUAMem16]>;
+def : Proc<"i386",            [FeatureSlowUAMem16]>;
+def : Proc<"i486",            [FeatureSlowUAMem16]>;
+def : Proc<"i586",            [FeatureSlowUAMem16]>;
+def : Proc<"pentium",         [FeatureSlowUAMem16]>;
+def : Proc<"pentium-mmx",     [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"i686",            [FeatureSlowUAMem16]>;
+def : Proc<"pentiumpro",      [FeatureSlowUAMem16, FeatureCMOV]>;
+def : Proc<"pentium2",        [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV]>;
+def : Proc<"pentium3",        [FeatureSlowUAMem16, FeatureSSE1]>;
+def : Proc<"pentium3m",       [FeatureSlowUAMem16, FeatureSSE1,
+                               FeatureSlowBTMem]>;
+def : Proc<"pentium-m",       [FeatureSlowUAMem16, FeatureSSE2,
+                               FeatureSlowBTMem]>;
+def : Proc<"pentium4",        [FeatureSlowUAMem16, FeatureSSE2]>;
+def : Proc<"pentium4m",       [FeatureSlowUAMem16, FeatureSSE2,
+                               FeatureSlowBTMem]>;
 
 // Intel Core Duo.
 def : ProcessorModel<"yonah", SandyBridgeModel,
-                     [FeatureSlowUAMem, FeatureSSE3, FeatureSlowBTMem]>;
+                     [FeatureSlowUAMem16, FeatureSSE3, FeatureSlowBTMem]>;
 
 // NetBurst.
-def : Proc<"prescott", [FeatureSlowUAMem, FeatureSSE3, FeatureSlowBTMem]>;
-def : Proc<"nocona", [FeatureSlowUAMem, FeatureSSE3, FeatureCMPXCHG16B,
+def : Proc<"prescott", [FeatureSlowUAMem16, FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"nocona", [FeatureSlowUAMem16, FeatureSSE3, FeatureCMPXCHG16B,
                       FeatureSlowBTMem]>;
 
 // Intel Core 2 Solo/Duo.
 def : ProcessorModel<"core2", SandyBridgeModel,
-                     [FeatureSlowUAMem, FeatureSSSE3, FeatureCMPXCHG16B,
+                     [FeatureSlowUAMem16, FeatureSSSE3, FeatureCMPXCHG16B,
                       FeatureSlowBTMem]>;
 def : ProcessorModel<"penryn", SandyBridgeModel,
-                     [FeatureSlowUAMem, FeatureSSE41, FeatureCMPXCHG16B,
+                     [FeatureSlowUAMem16, FeatureSSE41, FeatureCMPXCHG16B,
                       FeatureSlowBTMem]>;
 
 // Atom CPUs.
 class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [...]>;
 ...
 def : SilvermontProc<"slm">; // Legacy alias.
 
 // AMD CPUs.
-def : Proc<"k6",              [FeatureSlowUAMem, FeatureMMX]>;
-def : Proc<"k6-2",            [FeatureSlowUAMem, Feature3DNow]>;
-def : Proc<"k6-3",            [FeatureSlowUAMem, Feature3DNow]>;
-def : Proc<"athlon",          [FeatureSlowUAMem, Feature3DNowA,
+def : Proc<"k6",              [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"k6-2",            [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"k6-3",            [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"athlon",          [FeatureSlowUAMem16, Feature3DNowA,
                                FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-tbird",    [FeatureSlowUAMem, Feature3DNowA,
+def : Proc<"athlon-tbird",    [FeatureSlowUAMem16, Feature3DNowA,
                                FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-4",        [FeatureSlowUAMem, FeatureSSE1, Feature3DNowA,
+def : Proc<"athlon-4",        [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
                                FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-xp",       [FeatureSlowUAMem, FeatureSSE1, Feature3DNowA,
+def : Proc<"athlon-xp",       [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
                                FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-mp",       [FeatureSlowUAMem, FeatureSSE1, Feature3DNowA,
+def : Proc<"athlon-mp",       [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
                                FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"k8",              [FeatureSlowUAMem, FeatureSSE2, Feature3DNowA,
+def : Proc<"k8",              [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
                                Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"opteron",         [FeatureSlowUAMem, FeatureSSE2, Feature3DNowA,
+def : Proc<"opteron",         [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
                                Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon64",        [FeatureSlowUAMem, FeatureSSE2, Feature3DNowA,
+def : Proc<"athlon64",        [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
                                Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-fx",       [FeatureSlowUAMem, FeatureSSE2, Feature3DNowA,
+def : Proc<"athlon-fx",       [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
                                Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"k8-sse3",         [FeatureSlowUAMem, FeatureSSE3, Feature3DNowA,
+def : Proc<"k8-sse3",         [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
                                FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"opteron-sse3",    [FeatureSlowUAMem, FeatureSSE3, Feature3DNowA,
+def : Proc<"opteron-sse3",    [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
                                FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon64-sse3",   [FeatureSlowUAMem, FeatureSSE3, Feature3DNowA,
+def : Proc<"athlon64-sse3",   [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
                                FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"amdfam10",        [FeatureSSE4A,
@@ -483,12 +487,12 @@ def : Proc<"bdver4",          [FeatureAVX2, FeatureXOP, FeatureFMA4,
                                FeatureTBM, FeatureFMA, FeatureSSE4A,
                                FeatureFSGSBase]>;
 
-def : Proc<"geode",           [FeatureSlowUAMem, Feature3DNowA]>;
+def : Proc<"geode",           [FeatureSlowUAMem16, Feature3DNowA]>;
 
-def : Proc<"winchip-c6",      [FeatureSlowUAMem, FeatureMMX]>;
-def : Proc<"winchip2",        [FeatureSlowUAMem, Feature3DNow]>;
-def : Proc<"c3",              [FeatureSlowUAMem, Feature3DNow]>;
-def : Proc<"c3-2",            [FeatureSlowUAMem, FeatureSSE1]>;
+def : Proc<"winchip-c6",      [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"winchip2",        [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3",              [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3-2",            [FeatureSlowUAMem16, FeatureSSE1]>;
 
 // We also provide a generic 64-bit specific x86 processor model which tries to
 // be good for modern chips without enabling instruction set encodings past the
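
The first X86ISelLowering.cpp hunk below gates the memcpy/memset type
choice on the renamed predicate. A condensed sketch of that policy (the
helper name chooseMemOpVT is hypothetical; the condition mirrors the hunk):

    // Prefer one 16-byte SSE access only when unaligned 16-byte accesses
    // are fast, or when both operands are 16-byte aligned. An alignment
    // of 0 means the operand's alignment is not fixed and may be chosen.
    MVT chooseMemOpVT(const X86Subtarget *Subtarget, uint64_t Size,
                      unsigned DstAlign, unsigned SrcAlign) {
      if (Size >= 16 &&
          (!Subtarget->isUnalignedMem16Slow() ||
           ((DstAlign == 0 || DstAlign >= 16) &&
            (SrcAlign == 0 || SrcAlign >= 16))))
        return MVT::v4i32; // 16-byte vector ops
      return MVT::i64;     // fall back to 8-byte scalar chunks
    }
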
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 707fa5e6c6e..bfa4145d50e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1869,7 +1869,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   if ((!IsMemset || ZeroMemset) &&
       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
     if (Size >= 16 &&
-        (!Subtarget->isUnalignedMemUnder32Slow() ||
+        (!Subtarget->isUnalignedMem16Slow() ||
          ((DstAlign == 0 || DstAlign >= 16) &&
           (SrcAlign == 0 || SrcAlign >= 16)))) {
       if (Size >= 32) {
@@ -1916,7 +1916,9 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     if (VT.getSizeInBits() == 256)
       *Fast = !Subtarget->isUnalignedMem32Slow();
     else
-      *Fast = !Subtarget->isUnalignedMemUnder32Slow();
+      // FIXME: We should always return that 8-byte and under accesses are fast.
+      // That is what other x86 lowering code assumes.
+      *Fast = !Subtarget->isUnalignedMem16Slow();
   }
   return true;
 }
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 7a37d4ce926..cf9d8a8aac3 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -5511,7 +5511,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
   // TODO: Check if 32-byte or greater accesses are slow too?
   if (!MI->hasOneMemOperand() &&
       RC == &X86::VR128RegClass &&
-      Subtarget.isUnalignedMemUnder32Slow())
+      Subtarget.isUnalignedMem16Slow())
     // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
     // conservatively assume the address is unaligned. That's bad for
     // performance.
@@ -5659,7 +5659,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
                           cast<MachineSDNode>(N)->memoperands_end());
   if (!(*MMOs.first) &&
       RC == &X86::VR128RegClass &&
-      Subtarget.isUnalignedMemUnder32Slow())
+      Subtarget.isUnalignedMem16Slow())
     // Do not introduce a slow unaligned load.
     return false;
   // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
@@ -5704,7 +5704,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
                           cast<MachineSDNode>(N)->memoperands_end());
   if (!(*MMOs.first) &&
       RC == &X86::VR128RegClass &&
-      Subtarget.isUnalignedMemUnder32Slow())
+      Subtarget.isUnalignedMem16Slow())
     // Do not introduce a slow unaligned store.
     return false;
   // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
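
All three X86InstrInfo.cpp hunks above apply the same guard before
unfolding a folded memory operand. A hypothetical helper condensing the
repeated condition (not in the patch; shown only for clarity):

    // Unfolding a 128-bit (VR128) reg/mem instruction materializes a
    // separate 16-byte load or store. Without a known MachineMemOperand
    // the address must be conservatively treated as unaligned, so refuse
    // to unfold when unaligned 16-byte accesses are slow on this CPU.
    static bool shouldAvoidUnfold(const X86Subtarget &Subtarget,
                                  const TargetRegisterClass *RC,
                                  bool HasKnownMemOperand) {
      return !HasKnownMemOperand &&
             RC == &X86::VR128RegClass &&
             Subtarget.isUnalignedMem16Slow();
    }
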
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index b23b3c0e99a..5b53ca93399 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -197,7 +197,7 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   // introduced with Intel's Nehalem/Silvermont and AMD's Family10h
   // micro-architectures respectively.
   if (hasSSE42() || hasSSE4A())
-    IsUAMemUnder32Slow = false;
+    IsUAMem16Slow = false;
 
   InstrItins = getInstrItineraryForCPU(CPUName);
 
@@ -262,7 +262,7 @@ void X86Subtarget::initializeEnvironment() {
   HasMPX = false;
   IsBTMemSlow = false;
   IsSHLDSlow = false;
-  IsUAMemUnder32Slow = false;
+  IsUAMem16Slow = false;
   IsUAMem32Slow = false;
   HasSSEUnalignedMem = false;
   HasCmpxchg16b = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index d5d00277e5b..c5d74e66b7b 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -146,8 +146,8 @@ protected:
   /// True if SHLD instructions are slow.
   bool IsSHLDSlow;
 
-  /// True if unaligned memory accesses of 16-bytes or smaller are slow.
-  bool IsUAMemUnder32Slow;
+  /// True if unaligned memory accesses of 16-bytes are slow.
+  bool IsUAMem16Slow;
 
   /// True if unaligned memory accesses of 32-bytes are slow.
   bool IsUAMem32Slow;
@@ -357,7 +357,7 @@ public:
   bool hasRDSEED() const { return HasRDSEED; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
   bool isSHLDSlow() const { return IsSHLDSlow; }
-  bool isUnalignedMemUnder32Slow() const { return IsUAMemUnder32Slow; }
+  bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
   bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
   bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
-- 
2.34.1
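
The renamed attribute can be toggled through llc's standard -mattr
mechanism (illustrative invocations; test.ll stands for any IR file
containing a 16-byte memcpy and is not part of this patch):

    llc -mtriple=x86_64-unknown-unknown -mattr=+slow-unaligned-mem-16 test.ll -o -
    llc -mtriple=x86_64-unknown-unknown -mattr=-slow-unaligned-mem-16 test.ll -o -

With the feature enabled, lowering should generally prefer aligned or
smaller accesses; with it disabled, a single unaligned 16-byte access
(movups-style) is acceptable. Given the inconsistency noted in the
commit message, exact output depends on the code path exercised.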