From: Evan Cheng Date: Fri, 26 Sep 2008 23:41:32 +0000 (+0000) Subject: Implement "punpckldq %xmm0, %xmm0" as "pshufd $0x50, %xmm0, %xmm0" unless optimizing... X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=b7a75a5a541a836d07cfc5ccebd70bb5e42f797f;p=oota-llvm.git Implement "punpckldq %xmm0, %xmm0" as "pshufd $0x50, %xmm0, %xmm0" unless optimizing for code size. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@56711 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 56bffaba31c..b5c0c35d12e 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" @@ -130,12 +131,17 @@ namespace { /// MachineBasicBlock *CurBB; + /// OptForSize - If true, selector should try to optimize for code size + /// instead of performance. + bool OptForSize; + public: X86DAGToDAGISel(X86TargetMachine &tm, bool fast) : SelectionDAGISel(X86Lowering, fast), ContainsFPCode(false), TM(tm), X86Lowering(*TM.getTargetLowering()), - Subtarget(&TM.getSubtarget()) {} + Subtarget(&TM.getSubtarget()), + OptForSize(OptimizeForSize) {} virtual bool runOnFunction(Function &Fn) { // Make sure we re-emit a set of the global base reg if necessary @@ -650,6 +656,10 @@ void X86DAGToDAGISel::PreprocessForFPConvert() { /// when it has created a SelectionDAG for us to codegen. void X86DAGToDAGISel::InstructionSelect() { CurBB = BB; // BB can change as result of isel. 
+ if (!OptForSize) { + const Function *F = CurDAG->getMachineFunction().getFunction(); + OptForSize = !F->isDeclaration() && F->hasNote(Attribute::OptimizeForSize); + } DEBUG(BB->dump()); if (!Fast) diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index bd61c3aed46..11ea1c06db5 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -186,6 +186,7 @@ def In64BitMode : Predicate<"Subtarget->is64Bit()">; def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; def NotSmallCode : Predicate<"TM.getCodeModel() != CodeModel::Small">; def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; +def OptForSpeed : Predicate<"!OptForSize">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index af7694a1ee6..8b4579517e2 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -744,7 +744,7 @@ def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), addr:$dst)]>; let Constraints = "$src1 = $dst" in { -let AddedComplexity = 15 in { +let AddedComplexity = 20 in { def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movlhps\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, @@ -759,7 +759,7 @@ def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128: } // AddedComplexity } // Constraints = "$src1 = $dst" -let AddedComplexity = 15 in +let AddedComplexity = 20 in def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef), MOVDDUP_shuffle_mask)), (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>; @@ -2921,6 +2921,7 @@ def : Pat<(vector_shuffle (bc_v4i32 (memopv4f32 addr:$src1)), (undef), SHUFP_unary_shuffle_mask:$sm), (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>, Requires<[HasSSE2]>; + // Special binary v4i32 shuffle cases with SHUFPS. 
def : Pat<(v4i32 (vector_shuffle VR128:$src1, (v4i32 VR128:$src2), PSHUFD_binary_shuffle_mask:$sm)), @@ -2937,11 +2938,21 @@ def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2, Requires<[HasSSE2]>; // Special unary SHUFPDrri case. def : Pat<(v2i64 (vector_shuffle VR128:$src1, (undef), - SHUFP_unary_shuffle_mask:$sm)), + SHUFP_unary_shuffle_mask:$sm)), (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>, Requires<[HasSSE2]>; // vector_shuffle v1, <undef>, <0, 0, 1, 1, ...> +let AddedComplexity = 15 in { +def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), + UNPCKL_v_undef_shuffle_mask:$sm)), + (PSHUFDri VR128:$src, PSHUFD_shuffle_mask:$sm)>, + Requires<[OptForSpeed, HasSSE2]>; +def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef), + UNPCKL_v_undef_shuffle_mask:$sm)), + (PSHUFDri VR128:$src, PSHUFD_shuffle_mask:$sm)>, + Requires<[OptForSpeed, HasSSE2]>; +} let AddedComplexity = 10 in { def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef), UNPCKL_v_undef_shuffle_mask)), @@ -2958,6 +2969,16 @@ def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), } // vector_shuffle v1, <undef>, <2, 2, 3, 3, ...> +let AddedComplexity = 15 in { +def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), + UNPCKH_v_undef_shuffle_mask:$sm)), + (PSHUFDri VR128:$src, PSHUFD_shuffle_mask:$sm)>, + Requires<[OptForSpeed, HasSSE2]>; +def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef), + UNPCKH_v_undef_shuffle_mask:$sm)), + (PSHUFDri VR128:$src, PSHUFD_shuffle_mask:$sm)>, + Requires<[OptForSpeed, HasSSE2]>; +} let AddedComplexity = 10 in { def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef), UNPCKH_v_undef_shuffle_mask)), @@ -2973,7 +2994,7 @@ def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; } -let AddedComplexity = 15 in { +let AddedComplexity = 20 in { // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, MOVHP_shuffle_mask)), diff --git 
a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll index eddba87ba04..b45f9398e4f 100644 --- a/test/CodeGen/X86/vec_extract.ll +++ b/test/CodeGen/X86/vec_extract.ll @@ -4,7 +4,7 @@ ; RUN: grep pshufd %t | count 1 ; RUN: grep unpckhpd %t | count 1 -define void @test1(<4 x float>* %F, float* %f) { +define void @test1(<4 x float>* %F, float* %f) nounwind { %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=2] %tmp7 = add <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1] %tmp2 = extractelement <4 x float> %tmp7, i32 0 ; [#uses=1] @@ -12,21 +12,21 @@ define void @test1(<4 x float>* %F, float* %f) { ret void } -define float @test2(<4 x float>* %F, float* %f) { +define float @test2(<4 x float>* %F, float* %f) nounwind { %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=2] %tmp7 = add <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1] %tmp2 = extractelement <4 x float> %tmp7, i32 2 ; [#uses=1] ret float %tmp2 } -define void @test3(float* %R, <4 x float>* %P1) { +define void @test3(float* %R, <4 x float>* %P1) nounwind { %X = load <4 x float>* %P1 ; <<4 x float>> [#uses=1] %tmp = extractelement <4 x float> %X, i32 3 ; [#uses=1] store float %tmp, float* %R ret void } -define double @test4(double %A) { +define double @test4(double %A) nounwind { %tmp1 = call <2 x double> @foo( ) ; <<2 x double>> [#uses=1] %tmp2 = extractelement <2 x double> %tmp1, i32 1 ; [#uses=1] %tmp3 = add double %tmp2, %A ; [#uses=1] diff --git a/test/CodeGen/X86/vec_shuffle-23.ll b/test/CodeGen/X86/vec_shuffle-23.ll new file mode 100644 index 00000000000..34d84ef15f9 --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-23.ll @@ -0,0 +1,19 @@ +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep punpck +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufd +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -optimize-size | grep punpck + +define i32 @t() nounwind { +entry: + %a = alloca <4 x i32> ; <<4 x i32>*> [#uses=2] + %b = alloca <4 x i32> ; <<4 x i32>*> [#uses=5] + 
volatile store <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a + %tmp = load <4 x i32>* %a ; <<4 x i32>> [#uses=1] + store <4 x i32> %tmp, <4 x i32>* %b + %tmp1 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] + %tmp2 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] + %punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x i32>> [#uses=1] + store <4 x i32> %punpckldq, <4 x i32>* %b + %tmp3 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] + %result = extractelement <4 x i32> %tmp3, i32 0 ; [#uses=1] + ret i32 %result +}