; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s
; Codegen test: x86 FMA intrinsics used as loop-carried accumulators, compiled
; by llc for the core-avx2 CPU and verified with FileCheck.
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
; Loop-carried accumulator test for the 256-bit double-precision fmaddsub
; intrinsic: the intrinsic result (%0) is fed back through the %c.addr.0 phi
; and used as the third operand on the next iteration. The directives below
; expect the 231 form of the instruction on ymm registers inside the loop,
; followed by the counter increment, the compare, and the jl back-edge to the
; first matched label.
; CHECK-LABEL: fmaddsubpd_loop
; CHECK: [[BODYLBL:LBB.+]]:
; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
; CHECK: [[INCLBL:LBB.+]]:
; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
; CHECK: cmpl {{%.+}}, [[INDREG]]
; CHECK: jl [[BODYLBL]]
define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %for.cond

for.cond:
  ; %c.addr.0 is the accumulator (initially %c); %i.0 is the counter (initially 0).
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  ; %a and %b are loop-invariant; only the third operand changes per iteration.
  %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; Loop-carried accumulator test for the 256-bit double-precision fmsubadd
; intrinsic: the intrinsic result (%0) is fed back through the %c.addr.0 phi
; and used as the third operand on the next iteration. The directives below
; expect the 231 form of the instruction on ymm registers inside the loop,
; followed by the counter increment, the compare, and the jl back-edge to the
; first matched label.
; CHECK-LABEL: fmsubaddpd_loop
; CHECK: [[BODYLBL:LBB.+]]:
; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
; CHECK: [[INCLBL:LBB.+]]:
; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
; CHECK: cmpl {{%.+}}, [[INDREG]]
; CHECK: jl [[BODYLBL]]
define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %for.cond

for.cond:
  ; %c.addr.0 is the accumulator (initially %c); %i.0 is the counter (initially 0).
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  ; %a and %b are loop-invariant; only the third operand changes per iteration.
  %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; Loop-carried accumulator test for the 256-bit double-precision fmadd
; intrinsic: the intrinsic result (%0) is fed back through the %c.addr.0 phi
; and used as the third operand on the next iteration. The directives below
; expect the 231 form of the instruction on ymm registers inside the loop,
; followed by the counter increment, the compare, and the jl back-edge to the
; first matched label.
; CHECK-LABEL: fmaddpd_loop
; CHECK: [[BODYLBL:LBB.+]]:
; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
; CHECK: [[INCLBL:LBB.+]]:
; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
; CHECK: cmpl {{%.+}}, [[INDREG]]
; CHECK: jl [[BODYLBL]]
define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %for.cond

for.cond:
  ; %c.addr.0 is the accumulator (initially %c); %i.0 is the counter (initially 0).
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  ; %a and %b are loop-invariant; only the third operand changes per iteration.
  %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; Loop-carried accumulator test for the 256-bit double-precision fmsub
; intrinsic: the intrinsic result (%0) is fed back through the %c.addr.0 phi
; and used as the third operand on the next iteration. The directives below
; expect the 231 form of the instruction on ymm registers inside the loop,
; followed by the counter increment, the compare, and the jl back-edge to the
; first matched label.
; CHECK-LABEL: fmsubpd_loop
; CHECK: [[BODYLBL:LBB.+]]:
; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
; CHECK: [[INCLBL:LBB.+]]:
; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
; CHECK: cmpl {{%.+}}, [[INDREG]]
; CHECK: jl [[BODYLBL]]
define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %for.cond

for.cond:
  ; %c.addr.0 is the accumulator (initially %c); %i.0 is the counter (initially 0).
  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  ; %a and %b are loop-invariant; only the third operand changes per iteration.
  %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <4 x double> %c.addr.0
}

; Declarations of the 256-bit double-precision FMA intrinsics called by the
; *pd_loop functions in this file. Each takes three <4 x double> operands and
; returns a <4 x double>.
declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)

; Loop-carried accumulator test for the 256-bit single-precision fmaddsub
; intrinsic: the intrinsic result (%0) is fed back through the %c.addr.0 phi
; and used as the third operand on the next iteration. The directives below
; expect the 231 form of the instruction on ymm registers inside the loop,
; followed by the counter increment, the compare, and the jl back-edge to the
; first matched label.
; CHECK-LABEL: fmaddsubps_loop
; CHECK: [[BODYLBL:LBB.+]]:
; CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
; CHECK: [[INCLBL:LBB.+]]:
; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
; CHECK: cmpl {{%.+}}, [[INDREG]]
; CHECK: jl [[BODYLBL]]
define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %for.cond

for.cond:
  ; %c.addr.0 is the accumulator (initially %c); %i.0 is the counter (initially 0).
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  ; %a and %b are loop-invariant; only the third operand changes per iteration.
  %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; Loop-carried accumulator test for the 256-bit single-precision fmsubadd
; intrinsic: the intrinsic result (%0) is fed back through the %c.addr.0 phi
; and used as the third operand on the next iteration. The directives below
; expect the 231 form of the instruction on ymm registers inside the loop,
; followed by the counter increment, the compare, and the jl back-edge to the
; first matched label.
; CHECK-LABEL: fmsubaddps_loop
; CHECK: [[BODYLBL:LBB.+]]:
; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
; CHECK: [[INCLBL:LBB.+]]:
; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
; CHECK: cmpl {{%.+}}, [[INDREG]]
; CHECK: jl [[BODYLBL]]
define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %for.cond

for.cond:
  ; %c.addr.0 is the accumulator (initially %c); %i.0 is the counter (initially 0).
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  ; %a and %b are loop-invariant; only the third operand changes per iteration.
  %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; Loop-carried accumulator test for the 256-bit single-precision fmadd
; intrinsic: the intrinsic result (%0) is fed back through the %c.addr.0 phi
; and used as the third operand on the next iteration. The directives below
; expect the 231 form of the instruction on ymm registers inside the loop,
; followed by the counter increment, the compare, and the jl back-edge to the
; first matched label.
; CHECK-LABEL: fmaddps_loop
; CHECK: [[BODYLBL:LBB.+]]:
; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
; CHECK: [[INCLBL:LBB.+]]:
; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
; CHECK: cmpl {{%.+}}, [[INDREG]]
; CHECK: jl [[BODYLBL]]
define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %for.cond

for.cond:
  ; %c.addr.0 is the accumulator (initially %c); %i.0 is the counter (initially 0).
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  ; %a and %b are loop-invariant; only the third operand changes per iteration.
  %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; Loop-carried accumulator test for the 256-bit single-precision fmsub
; intrinsic: the intrinsic result (%0) is fed back through the %c.addr.0 phi
; and used as the third operand on the next iteration. The directives below
; expect the 231 form of the instruction on ymm registers inside the loop,
; followed by the counter increment, the compare, and the jl back-edge to the
; first matched label.
; CHECK-LABEL: fmsubps_loop
; CHECK: [[BODYLBL:LBB.+]]:
; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
; CHECK: [[INCLBL:LBB.+]]:
; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
; CHECK: cmpl {{%.+}}, [[INDREG]]
; CHECK: jl [[BODYLBL]]
define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %for.cond

for.cond:
  ; %c.addr.0 is the accumulator (initially %c); %i.0 is the counter (initially 0).
  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp = icmp slt i32 %i.0, %iter
  br i1 %cmp, label %for.body, label %for.end

for.body:
  br label %for.inc

for.inc:
  ; %a and %b are loop-invariant; only the third operand changes per iteration.
  %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:
  ret <8 x float> %c.addr.0
}

; Declarations of the 256-bit single-precision FMA intrinsics called by the
; *ps_loop functions in this file. Each takes three <8 x float> operands and
; returns an <8 x float>.
declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)