ARM: Improve pattern for isel mul of vector by scalar.

author Jim Grosbach <grosbach@apple.com>

Thu, 29 Aug 2013 22:41:46 +0000 (22:41 +0000)

committer Jim Grosbach <grosbach@apple.com>

Thu, 29 Aug 2013 22:41:46 +0000 (22:41 +0000)
author Jim Grosbach <grosbach@apple.com>
Thu, 29 Aug 2013 22:41:46 +0000 (22:41 +0000)
committer Jim Grosbach <grosbach@apple.com>
Thu, 29 Aug 2013 22:41:46 +0000 (22:41 +0000)
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td

index 49ae3348cd68dfcc601b3073cc316efbd0fe359a..f1bd37ea52695a878158c1e92c9b72bb0786af4c 100644 (file)
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -4022,6 +4022,17 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
                                     (DSubReg_i32_reg imm:$lane))),
                             (SubReg_i32_lane imm:$lane)))>;
  
+
+def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+          (VMULslfd DPR:$Rn,
+            (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+            (i32 0))>;
+def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+          (VMULslfq QPR:$Rn,
+            (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+            (i32 0))>;
+
+
  //   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
  defm VQDMULH  : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
                            IIC_VMULi16Q, IIC_VMULi32Q,
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll

index 6210ad3695d1f55f2466511927457f654061bb9f..5e5e99bc2f96207a7d500eea39ac3e4a1bd61cfc 100644 (file)
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -623,3 +623,21 @@ entry:
    store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4
    ret void
  }
+
+define void @foo(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
+;   Look for a normal scalar FP load rather than a to-all-lanes load.
+;   e.g., "vldr s0, [r2]" rather than "vld1.32  {d18[], d19[]}, [r2:32]"
+;   Then check that the vector multiply has folded the splat to all lanes
+;   and used a vector * scalar instruction.
+; CHECK: vldr  {{s[0-9]+}}, [r2]
+; CHECK: vmul.f32  q8, q8, d0[0]
+  %tmp = load float* %src, align 4
+  %tmp5 = load <4 x float>* %a, align 4
+  %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
+  %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
+  %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
+  %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
+  %tmp10 = fmul <4 x float> %tmp9, %tmp5
+  store <4 x float> %tmp10, <4 x float>* %dst, align 4
+  ret void
+}
author	Jim Grosbach <grosbach@apple.com>
	Thu, 29 Aug 2013 22:41:46 +0000 (22:41 +0000)
committer	Jim Grosbach <grosbach@apple.com>
	Thu, 29 Aug 2013 22:41:46 +0000 (22:41 +0000)
lib/Target/ARM/ARMInstrNEON.td		patch \| blob \| history
test/CodeGen/ARM/vmul.ll		patch \| blob \| history