From: Andrea Di Biagio
Date: Thu, 12 Dec 2013 11:50:47 +0000 (+0000)
Subject: Added new X86 patterns to select SSE scalar fp arithmetic instructions from
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=a29b054e7a88fefb17b099a2e7727897f8b3743a;p=oota-llvm.git

Added new X86 patterns to select SSE scalar fp arithmetic instructions from
a vector packed single/double fp operation followed by a vector insert.

The effect is that the backend converts the packed fp instruction
followed by a vector insert into a single SSE or AVX scalar fp instruction.

For example, given the following code:
  __m128 foo(__m128 A, __m128 B) {
    __m128 C = A + B;
    return (__m128) {C[0], A[1], A[2], A[3]};
  }

previously we generated:
  addps %xmm0, %xmm1
  movss %xmm1, %xmm0

we now generate:
  addss %xmm1, %xmm0

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@197145 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 08b547cf6d7..bbf86cdd025 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3142,6 +3142,89 @@ let AddedComplexity = 20, Predicates = [HasAVX] in {
             (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
 }
 
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// a vector packed single/double fp operation followed by a vector insert.
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+//   __m128 foo(__m128 A, __m128 B) {
+//     __m128 C = A + B;
+//     return (__m128) {C[0], A[1], A[2], A[3]};
+//   }
+//
+// previously we generated:
+//   addps %xmm0, %xmm1
+//   movss %xmm1, %xmm0
+//
+// we now generate:
+//   addss %xmm1, %xmm0
+
+def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                 (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+          (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                 (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+          (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                 (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+          (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
+def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                 (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+          (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+
+let Predicates = [HasSSE2] in {
+  // SSE2 patterns to select scalar double-precision fp arithmetic
+  // instructions from a packed double-precision fp instruction plus movsd.
+
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
+let AddedComplexity = 20, Predicates = [HasAVX] in {
+  // The following patterns select AVX scalar single/double precision fp
+  // arithmetic instructions from a packed single/double precision fp
+  // instruction plus movss/movsd.
+  // The 'AddedComplexity' is required to give them higher priority over
+  // the equivalent SSE/SSE2 patterns.
+
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+            (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
 /// Unop Arithmetic
 /// In addition, we also have a special variant of the scalar form here to
 /// represent the associated intrinsic operation. This form is unlike the
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
new file mode 100644
index 00000000000..59685993f5d
--- /dev/null
+++ b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
@@ -0,0 +1,215 @@
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+
+; Ensure that the backend selects SSE/AVX scalar fp instructions
+; from a packed fp instruction plus a vector insert.
+
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = fadd <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_add_ss
+; SSE2: addss %xmm1, %xmm0
+; AVX: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = fsub <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_sub_ss
+; SSE2: subss %xmm1, %xmm0
+; AVX: vsubss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = fmul <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_mul_ss
+; SSE2: mulss %xmm1, %xmm0
+; AVX: vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = fdiv <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_div_ss
+; SSE2: divss %xmm1, %xmm0
+; AVX: vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = fadd <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_add_sd
+; SSE2: addsd %xmm1, %xmm0
+; AVX: vaddsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = fsub <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_sub_sd
+; SSE2: subsd %xmm1, %xmm0
+; AVX: vsubsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = fmul <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_mul_sd
+; SSE2: mulsd %xmm1, %xmm0
+; AVX: vmulsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = fdiv <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_div_sd
+; SSE2: divsd %xmm1, %xmm0
+; AVX: vdivsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = fadd <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_add_ss
+; SSE2: addss %xmm0, %xmm1
+; AVX: vaddss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = fsub <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_sub_ss
+; SSE2: subss %xmm0, %xmm1
+; AVX: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = fmul <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_mul_ss
+; SSE2: mulss %xmm0, %xmm1
+; AVX: vmulss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = fdiv <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_div_ss
+; SSE2: divss %xmm0, %xmm1
+; AVX: vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = fadd <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_add_sd
+; SSE2: addsd %xmm0, %xmm1
+; AVX: vaddsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = fsub <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_sub_sd
+; SSE2: subsd %xmm0, %xmm1
+; AVX: vsubsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = fmul <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_mul_sd
+; SSE2: mulsd %xmm0, %xmm1
+; AVX: vmulsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = fdiv <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_div_sd
+; SSE2: divsd %xmm0, %xmm1
+; AVX: vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
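
Note: as a worked double-precision analogue of the commit-message example
(a hypothetical source sketch, not part of the patch; it assumes GCC/Clang
vector extensions so that __m128d values can be indexed directly):

  __m128d foo(__m128d A, __m128d B) {
    __m128d C = A + B;              /* packed add: addpd */
    return (__m128d) {C[0], A[1]};  /* insert lane 0 of C into A */
  }

With these patterns the backend is expected to emit a single
'addsd %xmm1, %xmm0' (or 'vaddsd %xmm1, %xmm0, %xmm0' under AVX) instead
of addpd followed by movsd, as checked by test_add_sd above.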