[Feature64Bit]>;
def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
"Bit testing of memory is slow">;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+ "SHLD instruction is slow">;
def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
"IsUAMemFast", "true",
"Fast unaligned memory access">;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
def : Proc<"k6-3", [Feature3DNow]>;
-def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"amdfam10", [FeatureSSE4A,
Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
- FeaturePOPCNT, FeatureSlowBTMem]>;
+ FeaturePOPCNT, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
// Bobcat
def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
- FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT]>;
+ FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT,
+ FeatureSlowSHLD]>;
// Jaguar
def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
FeatureBMI, FeatureF16C, FeatureMOVBE,
- FeatureLZCNT, FeaturePOPCNT]>;
+ FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
// Bulldozer
def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
- FeatureLZCNT, FeaturePOPCNT]>;
+ FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
// Piledriver
def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
FeatureF16C, FeatureLZCNT,
- FeaturePOPCNT, FeatureBMI, FeatureTBM,
- FeatureFMA]>;
+ FeaturePOPCNT, FeatureBMI, FeatureTBM,
+ FeatureFMA, FeatureSlowSHLD]>;
// Steamroller
def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool OptForSize = MF.getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+ // SHLD/SHRD instructions have lower register pressure, but on some
+ // platforms they have higher latency than the equivalent
+ // series of shifts/or that would otherwise be generated.
+ // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
+ // have higher latencies and we are not optimizing for size.
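+ // For instance, for a 64-bit left shift by 7 the tests added with this
+ // patch expect a shlq $7 / shrq $57 / leaq sequence rather than a single
+ // shldq $7 on the affected AMD targets.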
+ if (!OptForSize && Subtarget->isSHLDSlow())
+ return SDValue();
+
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
ToggleFeature(X86::FeatureSlowBTMem);
}
+ // Determine if SHLD/SHRD instructions have higher latency than the
+ // equivalent series of shifts/or instructions.
+ // FIXME: Add Intel processors that have SHLD instructions with very
+ // poor latency.
+ if (IsAMD) {
+ IsSHLDSlow = true;
+ ToggleFeature(X86::FeatureSlowSHLD);
+ }
+
// If it's an Intel chip since Nehalem and not an Atom chip, unaligned
// memory access is fast. We hard code model numbers here because they
// aren't strictly increasing for Intel chips it seems.
HasPRFCHW = false;
HasRDSEED = false;
IsBTMemSlow = false;
+ IsSHLDSlow = false;
IsUAMemFast = false;
HasVectorUAMem = false;
HasCmpxchg16b = false;
/// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
+ /// IsSHLDSlow - True if SHLD instructions are slow.
+ bool IsSHLDSlow;
+
/// IsUAMemFast - True if unaligned memory access is fast.
bool IsUAMemFast;
bool hasPRFCHW() const { return HasPRFCHW; }
bool hasRDSEED() const { return HasRDSEED; }
bool isBTMemSlow() const { return IsBTMemSlow; }
+ bool isSHLDSlow() const { return IsSHLDSlow; }
bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
bool hasVectorUAMem() const { return HasVectorUAMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
--- /dev/null
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; Verify that, for architectures known to have poor-latency double-precision
+; shift instructions, we generate an alternative sequence of instructions
+; with lower latency instead of the shld instruction.
+
+;uint64_t lshift1(uint64_t a, uint64_t b)
+;{
+; return (a << 1) | (b >> 63);
+;}
+
+; CHECK: lshift1:
+; CHECK: addq {{.*}},{{.*}}
+; CHECK-NEXT: shrq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @lshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 1
+ %shr = lshr i64 %b, 63
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t lshift2(uint64_t a, uint64_t b)
+;{
+; return (a << 2) | (b >> 62);
+;}
+
+; CHECK: lshift2:
+; CHECK: shlq $2, {{.*}}
+; CHECK-NEXT: shrq $62, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 2
+ %shr = lshr i64 %b, 62
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t lshift7(uint64_t a, uint64_t b)
+;{
+; return (a << 7) | (b >> 57);
+;}
+
+; CHECK: lshift7:
+; CHECK: shlq $7, {{.*}}
+; CHECK-NEXT: shrq $57, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 7
+ %shr = lshr i64 %b, 57
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t lshift63(uint64_t a, uint64_t b)
+;{
+; return (a << 63) | (b >> 1);
+;}
+
+; CHECK: lshift63:
+; CHECK: shlq $63, {{.*}}
+; CHECK-NEXT: shrq {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 63
+ %shr = lshr i64 %b, 1
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
--- /dev/null
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; Verify that, for architectures known to have poor-latency double-precision
+; shift instructions, we generate an alternative sequence of instructions
+; with lower latency instead of the shrd instruction.
+
+;uint64_t rshift1(uint64_t a, uint64_t b)
+;{
+; return (a >> 1) | (b << 63);
+;}
+
+; CHECK: rshift1:
+; CHECK: shrq {{.*}}
+; CHECK-NEXT: shlq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @rshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 1
+ %2 = shl i64 %b, 63
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
+
+;uint64_t rshift2(uint64_t a, uint64_t b)
+;{
+; return (a >> 2) | (b << 62);
+;}
+
+; CHECK: rshift2:
+; CHECK: shrq $2, {{.*}}
+; CHECK-NEXT: shlq $62, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @rshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 2
+ %2 = shl i64 %b, 62
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
+
+;uint64_t rshift7(uint64_t a, uint64_t b)
+;{
+; return (a >> 7) | (b << 57);
+;}
+
+; CHECK: rshift7:
+; CHECK: shrq $7, {{.*}}
+; CHECK-NEXT: shlq $57, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @rshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 7
+ %2 = shl i64 %b, 57
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
+
+;uint64_t rshift63(uint64_t a, uint64_t b)
+;{
+; return (a >> 63) | (b << 1);
+;}
+
+; CHECK: rshift63:
+; CHECK: shrq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+; CHECK-NEXT: orq {{.*}}, {{.*}}
+
+define i64 @rshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 63
+ %2 = shl i64 %b, 1
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
--- /dev/null
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+
+; clang -Oz -c test1.cpp -emit-llvm -S
+; Verify that we generate the shld instruction when we are optimizing for
+; size, even for X86_64 processors that are known to have poor-latency
+; double-precision shift instructions.
+; uint64_t lshift10(uint64_t a, uint64_t b)
+; {
+; return (a << 10) | (b >> 54);
+; }
+
+; Function Attrs: minsize nounwind optsize readnone uwtable
+define i64 @_Z8lshift10mm(i64 %a, i64 %b) #0 {
+entry:
+; CHECK: shldq $10
+ %shl = shl i64 %a, 10
+ %shr = lshr i64 %b, 54
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+attributes #0 = { minsize nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
+; clang -Os -c test2.cpp -emit-llvm -S
+; Verify that we generate the shld instruction when we are optimizing for
+; size, even for X86_64 processors that are known to have poor-latency
+; double-precision shift instructions.
+; uint64_t lshift11(uint64_t a, uint64_t b)
+; {
+; return (a << 11) | (b >> 53);
+; }
+
+; Function Attrs: nounwind optsize readnone uwtable
+define i64 @_Z8lshift11mm(i64 %a, i64 %b) #1 {
+entry:
+; CHECK: shldq $11
+ %shl = shl i64 %a, 11
+ %shr = lshr i64 %b, 53
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; clang -O2 -c test2.cpp -emit-llvm -S
+; Verify that we do not generate the shld instruction when we are not
+; optimizing for size, for X86_64 processors that are known to have
+; poor-latency double-precision shift instructions.
+; uint64_t lshift12(uint64_t a, uint64_t b)
+; {
+; return (a << 12) | (b >> 52);
+; }
+
+; Function Attrs: nounwind readnone uwtable
+define i64 @_Z8lshift12mm(i64 %a, i64 %b) #2 {
+entry:
+; CHECK: shlq $12
+; CHECK-NEXT: shrq $52
+ %shl = shl i64 %a, 12
+ %shr = lshr i64 %b, 52
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
--- /dev/null
+; RUN: llc < %s -march=x86-64 -mcpu=athlon | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-tbird | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-4 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-xp | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-mp | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=k8 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=opteron | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-fx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=k8-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=opteron-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon64-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=amdfam10 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=btver1 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=btver2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=bdver2 | FileCheck %s
+
+; Verify that, for X86_64 processors known to have poor-latency double-precision
+; shift instructions, we do not generate 'shld' or 'shrd' instructions when the
+; shift amount is variable.
+
+;uint64_t lshift(uint64_t a, uint64_t b, int c)
+;{
+; return (a << c) | (b >> (64-c));
+;}
+
+define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+entry:
+; CHECK-NOT: shld
+ %sh_prom = zext i32 %c to i64
+ %shl = shl i64 %a, %sh_prom
+ %sub = sub nsw i32 64, %c
+ %sh_prom1 = zext i32 %sub to i64
+ %shr = lshr i64 %b, %sh_prom1
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t rshift(uint64_t a, uint64_t b, int c)
+;{
+; return (a >> c) | (b << (64-c));
+;}
+
+define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+entry:
+; CHECK-NOT: shrd
+ %sh_prom = zext i32 %c to i64
+ %shr = lshr i64 %a, %sh_prom
+ %sub = sub nsw i32 64, %c
+ %sh_prom1 = zext i32 %sub to i64
+ %shl = shl i64 %b, %sh_prom1
+ %or = or i64 %shl, %shr
+ ret i64 %or
+}
+
+