brcm2708: update linux 4.4 patches to latest version
target/linux/brcm2708/patches-4.4/0294-drm-vc4-improve-throughput-by-pipelining-binning-and.patch
From a2c21b04f340f594c16f9c7235ec7b7f78a96a1f Mon Sep 17 00:00:00 2001
From: Varad Gautam <varadgautam@gmail.com>
Date: Wed, 17 Feb 2016 19:08:21 +0530
Subject: [PATCH 294/304] drm/vc4: improve throughput by pipelining binning and
 rendering jobs

The hardware provides us with separate threads for binning and
rendering, and the existing model waits for them both to complete
before submitting the next job.

Splitting the binning and rendering submissions reduces idle time and
gives us approx 20-30% speedup with some x11perf tests such as -line10
and -tilerect1.  Improves openarena performance by 1.01897% +/-
0.247857% (n=16).
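
In rough outline, the flow after this change (a simplified sketch of
the code below, not the literal driver source):

	/* Submit: queue the job for binning; kick the binner if idle. */
	list_add_tail(&exec->head, &vc4->bin_job_list);
	if (vc4_first_bin_job(vc4) == exec)
		vc4_submit_next_bin_job(dev);

	/* Binner flush done IRQ: hand the frame to the render thread
	 * and immediately start binning the next one, so both hardware
	 * threads stay busy.
	 */
	vc4_move_job_to_render(dev, exec);
	vc4_submit_next_bin_job(dev);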

Thanks to anholt for suggesting this.

v2: Rebase on the spurious resets fix (change by anholt).

Signed-off-by: Varad Gautam <varadgautam@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit ca26d28bbaa39f31d5e7e4812603b015c8d54207)
---
 drivers/gpu/drm/vc4/vc4_drv.h |  37 +++++++++----
 drivers/gpu/drm/vc4/vc4_gem.c | 123 ++++++++++++++++++++++++++++++------------
 drivers/gpu/drm/vc4/vc4_irq.c |  58 ++++++++++++++++----
 3 files changed, 166 insertions(+), 52 deletions(-)

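The interrupt handler ends up dispatching three V3D status bits in this
order (a simplified sketch of the handling this patch adds; FLDONE
fires when the binner finishes a frame, FRDONE when the renderer
finishes one):

	if (intctl & V3D_INT_OUTOMEM)	/* binner ran out of memory */
		schedule_work(&vc4->overflow_mem_work);
	if (intctl & V3D_INT_FLDONE)	/* binning done: pipeline next frame */
		vc4_irq_finish_bin_job(dev);
	if (intctl & V3D_INT_FRDONE)	/* rendering done: retire the job */
		vc4_irq_finish_render_job(dev);
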
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -53,7 +53,7 @@ struct vc4_dev {
        /* Protects bo_cache and the BO stats. */
        struct mutex bo_lock;
 
-       /* Sequence number for the last job queued in job_list.
+       /* Sequence number for the last job queued in bin_job_list.
        * Starts at 0 (no jobs emitted).
        */
        uint64_t emit_seqno;
@@ -63,11 +63,19 @@ struct vc4_dev {
        */
        uint64_t finished_seqno;
 
-       /* List of all struct vc4_exec_info for jobs to be executed.
-        * The first job in the list is the one currently programmed
-        * into ct0ca/ct1ca for execution.
+       /* List of all struct vc4_exec_info for jobs to be executed in
+        * the binner.  The first job in the list is the one currently
+        * programmed into ct0ca for execution.
+        */
+       struct list_head bin_job_list;
+
+       /* List of all struct vc4_exec_info for jobs that have
+        * completed binning and are ready for rendering.  The first
+        * job in the list is the one currently programmed into ct1ca
+        * for execution.
        */
-       struct list_head job_list;
+       struct list_head render_job_list;
+
        /* List of the finished vc4_exec_infos waiting to be freed by
        * job_done_work.
        */
@@ -291,11 +299,20 @@ struct vc4_exec_info {
 };
 
 static inline struct vc4_exec_info *
-vc4_first_job(struct vc4_dev *vc4)
+vc4_first_bin_job(struct vc4_dev *vc4)
+{
+       if (list_empty(&vc4->bin_job_list))
+               return NULL;
+       return list_first_entry(&vc4->bin_job_list, struct vc4_exec_info, head);
+}
+
+static inline struct vc4_exec_info *
+vc4_first_render_job(struct vc4_dev *vc4)
 {
-       if (list_empty(&vc4->job_list))
+       if (list_empty(&vc4->render_job_list))
                return NULL;
-       return list_first_entry(&vc4->job_list, struct vc4_exec_info, head);
+       return list_first_entry(&vc4->render_job_list,
+                               struct vc4_exec_info, head);
 }
 
 /**
@@ -410,7 +427,9 @@ int vc4_wait_seqno_ioctl(struct drm_devi
                         struct drm_file *file_priv);
 int vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
                      struct drm_file *file_priv);
-void vc4_submit_next_job(struct drm_device *dev);
+void vc4_submit_next_bin_job(struct drm_device *dev);
+void vc4_submit_next_render_job(struct drm_device *dev);
+void vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec);
 int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno,
                       uint64_t timeout_ns, bool interruptible);
 void vc4_job_handle_completed(struct vc4_dev *vc4);
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -154,10 +154,10 @@ vc4_save_hang_state(struct drm_device *d
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        struct drm_vc4_get_hang_state *state;
        struct vc4_hang_state *kernel_state;
-       struct vc4_exec_info *exec;
+       struct vc4_exec_info *exec[2];
        struct vc4_bo *bo;
        unsigned long irqflags;
-       unsigned int i, unref_list_count;
+       unsigned int i, j, unref_list_count, prev_idx;
 
        kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
        if (!kernel_state)
@@ -166,37 +166,55 @@ vc4_save_hang_state(struct drm_device *d
        state = &kernel_state->user_state;
 
        spin_lock_irqsave(&vc4->job_lock, irqflags);
-       exec = vc4_first_job(vc4);
-       if (!exec) {
+       exec[0] = vc4_first_bin_job(vc4);
+       exec[1] = vc4_first_render_job(vc4);
+       if (!exec[0] && !exec[1]) {
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                return;
        }
 
-       unref_list_count = 0;
-       list_for_each_entry(bo, &exec->unref_list, unref_head)
-               unref_list_count++;
-
-       state->bo_count = exec->bo_count + unref_list_count;
-       kernel_state->bo = kcalloc(state->bo_count, sizeof(*kernel_state->bo),
-                                  GFP_ATOMIC);
+       /* Get the bos from both binner and renderer into hang state. */
+       state->bo_count = 0;
+       for (i = 0; i < 2; i++) {
+               if (!exec[i])
+                       continue;
+
+               unref_list_count = 0;
+               list_for_each_entry(bo, &exec[i]->unref_list, unref_head)
+                       unref_list_count++;
+               state->bo_count += exec[i]->bo_count + unref_list_count;
+       }
+
+       kernel_state->bo = kcalloc(state->bo_count,
+                                  sizeof(*kernel_state->bo), GFP_ATOMIC);
+
        if (!kernel_state->bo) {
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                return;
        }
 
-       for (i = 0; i < exec->bo_count; i++) {
-               drm_gem_object_reference(&exec->bo[i]->base);
-               kernel_state->bo[i] = &exec->bo[i]->base;
-       }
+       prev_idx = 0;
+       for (i = 0; i < 2; i++) {
+               if (!exec[i])
+                       continue;
+
+               for (j = 0; j < exec[i]->bo_count; j++) {
+                       drm_gem_object_reference(&exec[i]->bo[j]->base);
+                       kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
+               }
 
-       list_for_each_entry(bo, &exec->unref_list, unref_head) {
-               drm_gem_object_reference(&bo->base.base);
-               kernel_state->bo[i] = &bo->base.base;
-               i++;
+               list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
+                       drm_gem_object_reference(&bo->base.base);
+                       kernel_state->bo[j + prev_idx] = &bo->base.base;
+                       j++;
+               }
+               prev_idx = j + 1;
        }
 
-       state->start_bin = exec->ct0ca;
-       state->start_render = exec->ct1ca;
+       if (exec[0])
+               state->start_bin = exec[0]->ct0ca;
+       if (exec[1])
+               state->start_render = exec[1]->ct1ca;
 
        spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 
@@ -272,13 +290,15 @@ vc4_hangcheck_elapsed(unsigned long data
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        uint32_t ct0ca, ct1ca;
        unsigned long irqflags;
-       struct vc4_exec_info *exec;
+       struct vc4_exec_info *bin_exec, *render_exec;
 
        spin_lock_irqsave(&vc4->job_lock, irqflags);
-       exec = vc4_first_job(vc4);
+
+       bin_exec = vc4_first_bin_job(vc4);
+       render_exec = vc4_first_render_job(vc4);
 
        /* If idle, we can stop watching for hangs. */
-       if (!exec) {
+       if (!bin_exec && !render_exec) {
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                return;
        }
@@ -289,9 +309,12 @@ vc4_hangcheck_elapsed(unsigned long data
        /* If we've made any progress in execution, rearm the timer
        * and wait.
        */
-       if (ct0ca != exec->last_ct0ca || ct1ca != exec->last_ct1ca) {
-               exec->last_ct0ca = ct0ca;
-               exec->last_ct1ca = ct1ca;
+       if ((bin_exec && ct0ca != bin_exec->last_ct0ca) ||
+           (render_exec && ct1ca != render_exec->last_ct1ca)) {
+               if (bin_exec)
+                       bin_exec->last_ct0ca = ct0ca;
+               if (render_exec)
+                       render_exec->last_ct1ca = ct1ca;
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                vc4_queue_hangcheck(dev);
                return;
@@ -391,11 +414,13 @@ vc4_flush_caches(struct drm_device *dev)
  * The job_lock should be held during this.
  */
 void
-vc4_submit_next_job(struct drm_device *dev)
+vc4_submit_next_bin_job(struct drm_device *dev)
 {
        struct vc4_dev *vc4 = to_vc4_dev(dev);
-       struct vc4_exec_info *exec = vc4_first_job(vc4);
+       struct vc4_exec_info *exec;
 
+again:
+       exec = vc4_first_bin_job(vc4);
        if (!exec)
                return;
 
@@ -405,11 +430,40 @@ vc4_submit_next_job(struct drm_device *d
        V3D_WRITE(V3D_BPOA, 0);
        V3D_WRITE(V3D_BPOS, 0);
 
-       if (exec->ct0ca != exec->ct0ea)
+       /* Either put the job in the binner if it uses the binner, or
+        * immediately move it to the to-be-rendered queue.
+        */
+       if (exec->ct0ca != exec->ct0ea) {
                submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
+       } else {
+               vc4_move_job_to_render(dev, exec);
+               goto again;
+       }
+}
+
+void
+vc4_submit_next_render_job(struct drm_device *dev)
+{
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
+       struct vc4_exec_info *exec = vc4_first_render_job(vc4);
+
+       if (!exec)
+               return;
+
        submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
 }
 
+void
+vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
+       bool was_empty = list_empty(&vc4->render_job_list);
+
+       list_move_tail(&exec->head, &vc4->render_job_list);
+       if (was_empty)
+               vc4_submit_next_render_job(dev);
+}
+
 static void
 vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
 {
@@ -448,14 +502,14 @@ vc4_queue_submit(struct drm_device *dev,
        exec->seqno = seqno;
        vc4_update_bo_seqnos(exec, seqno);
 
-       list_add_tail(&exec->head, &vc4->job_list);
+       list_add_tail(&exec->head, &vc4->bin_job_list);
 
        /* If no job was executing, kick ours off.  Otherwise, it'll
-        * get started when the previous job's frame done interrupt
+        * get started when the previous job's flush done interrupt
        * occurs.
        */
-       if (vc4_first_job(vc4) == exec) {
-               vc4_submit_next_job(dev);
+       if (vc4_first_bin_job(vc4) == exec) {
+               vc4_submit_next_bin_job(dev);
                vc4_queue_hangcheck(dev);
        }
 
@@ -849,7 +903,8 @@ vc4_gem_init(struct drm_device *dev)
 {
        struct vc4_dev *vc4 = to_vc4_dev(dev);
 
-       INIT_LIST_HEAD(&vc4->job_list);
+       INIT_LIST_HEAD(&vc4->bin_job_list);
+       INIT_LIST_HEAD(&vc4->render_job_list);
        INIT_LIST_HEAD(&vc4->job_done_list);
        INIT_LIST_HEAD(&vc4->seqno_cb_list);
        spin_lock_init(&vc4->job_lock);
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -30,6 +30,10 @@
  * disables that specific interrupt, and 0s written are ignored
  * (reading either one returns the set of enabled interrupts).
  *
+ * When we take a binning flush done interrupt, we need to submit the
+ * next frame for binning and move the finished frame to the render
+ * thread.
+ *
  * When we take a render frame interrupt, we need to wake the
  * processes waiting for some frame to be done, and get the next frame
  * submitted ASAP (so the hardware doesn't sit idle when there's work
@@ -44,6 +48,7 @@
 #include "vc4_regs.h"
 
 #define V3D_DRIVER_IRQS (V3D_INT_OUTOMEM | \
+                        V3D_INT_FLDONE | \
                         V3D_INT_FRDONE)
 
 DECLARE_WAIT_QUEUE_HEAD(render_wait);
@@ -77,7 +82,7 @@ vc4_overflow_mem_work(struct work_struct
                unsigned long irqflags;
 
                spin_lock_irqsave(&vc4->job_lock, irqflags);
-               current_exec = vc4_first_job(vc4);
+               current_exec = vc4_first_bin_job(vc4);
                if (current_exec) {
                        vc4->overflow_mem->seqno = vc4->finished_seqno + 1;
                        list_add_tail(&vc4->overflow_mem->unref_head,
@@ -98,17 +103,43 @@ vc4_overflow_mem_work(struct work_struct
 }
 
 static void
-vc4_irq_finish_job(struct drm_device *dev)
+vc4_irq_finish_bin_job(struct drm_device *dev)
+{
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
+       struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
+
+       if (!exec)
+               return;
+
+       vc4_move_job_to_render(dev, exec);
+       vc4_submit_next_bin_job(dev);
+}
+
+static void
+vc4_cancel_bin_job(struct drm_device *dev)
+{
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
+       struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
+
+       if (!exec)
+               return;
+
+       list_move_tail(&exec->head, &vc4->bin_job_list);
+       vc4_submit_next_bin_job(dev);
+}
+
+static void
+vc4_irq_finish_render_job(struct drm_device *dev)
 {
        struct vc4_dev *vc4 = to_vc4_dev(dev);
-       struct vc4_exec_info *exec = vc4_first_job(vc4);
+       struct vc4_exec_info *exec = vc4_first_render_job(vc4);
 
        if (!exec)
                return;
 
        vc4->finished_seqno++;
        list_move_tail(&exec->head, &vc4->job_done_list);
-       vc4_submit_next_job(dev);
+       vc4_submit_next_render_job(dev);
 
        wake_up_all(&vc4->job_wait_queue);
        schedule_work(&vc4->job_done_work);
@@ -125,9 +156,10 @@ vc4_irq(int irq, void *arg)
        barrier();
        intctl = V3D_READ(V3D_INTCTL);
 
-       /* Acknowledge the interrupts we're handling here. The render
-        * frame done interrupt will be cleared, while OUTOMEM will
-        * stay high until the underlying cause is cleared.
+       /* Acknowledge the interrupts we're handling here. The binner
+        * last flush / render frame done interrupt will be cleared,
+        * while OUTOMEM will stay high until the underlying cause is
+        * cleared.
        */
        V3D_WRITE(V3D_INTCTL, intctl);
 
@@ -138,9 +170,16 @@ vc4_irq(int irq, void *arg)
                status = IRQ_HANDLED;
        }
 
+       if (intctl & V3D_INT_FLDONE) {
+               spin_lock(&vc4->job_lock);
+               vc4_irq_finish_bin_job(dev);
+               spin_unlock(&vc4->job_lock);
+               status = IRQ_HANDLED;
+       }
+
        if (intctl & V3D_INT_FRDONE) {
                spin_lock(&vc4->job_lock);
-               vc4_irq_finish_job(dev);
+               vc4_irq_finish_render_job(dev);
                spin_unlock(&vc4->job_lock);
                status = IRQ_HANDLED;
        }
@@ -205,6 +244,7 @@ void vc4_irq_reset(struct drm_device *de
        V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
 
        spin_lock_irqsave(&vc4->job_lock, irqflags);
-       vc4_irq_finish_job(dev);
+       vc4_cancel_bin_job(dev);
+       vc4_irq_finish_render_job(dev);
        spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 }