test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll

   1 ; RUN: opt %s -analyze -divergence | FileCheck %s
   2
   3 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
   4 target triple = "nvptx64-nvidia-cuda"
   5
   6 ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
   7 define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { ; listed as a kernel in !nvvm.annotations
   8 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge'
   9 entry:
  10   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; thread index, only used in the two adds below
  11   %cond = icmp slt i32 %n, 0 ; computed from the parameter %n alone
  12   br i1 %cond, label %then, label %else ; uniform: the condition does not involve %tid
  13 ; CHECK-NOT: DIVERGENT: br i1 %cond,
  14 then:
  15   %a1 = add i32 %a, %tid ; a + threadIdx.x
  16   br label %merge
  17 else:
  18   %b2 = add i32 %b, %tid ; b + threadIdx.x
  19   br label %merge
  20 merge:
  21   %c = phi i32 [ %a1, %then ], [ %b2, %else ] ; picks the sum from whichever side was taken
  22   ret i32 %c
  23 }
  24
  25 ; c = a;
  26 ; if (threadIdx.y < 5)    // divergent: data dependent
  27 ;   c = b;
  28 ; return c;               // c is divergent: sync dependent
  29 define i32 @sync(i32 %a, i32 %b) { ; listed as a kernel in !nvvm.annotations
  30 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync'
  31 bb1:
  32   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() ; this test reads the tid.y register
  33   %cond = icmp slt i32 %tid, 5 ; computed directly from %tid
  34   br i1 %cond, label %bb2, label %bb3 ; %bb2 is only visited when %tid < 5
  35 ; CHECK: DIVERGENT: br i1 %cond,
  36 bb2:
  37   br label %bb3
  38 bb3:
  39   %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid: both inputs are parameters, the choice depends on the branch above
  40 ; CHECK: DIVERGENT: %c =
  41   ret i32 %c
  42 }
  43
  44 ; c = 0;
  45 ; if (threadIdx.z >= 5) {  // divergent
  46 ;   c = (n < 0 ? b : a);  // c here is uniform because n is uniform
  47 ; }
  48 ; // c here is divergent because it is sync dependent on threadIdx.z >= 5
  49 ; return c;
  50 define i32 @mixed(i32 %n, i32 %a, i32 %b) { ; listed as a kernel in !nvvm.annotations
  51 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed'
  52 bb1:
  53   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() ; this test reads the tid.z register
  54   %cond = icmp slt i32 %tid, 5 ; computed directly from %tid
  55   br i1 %cond, label %bb6, label %bb2 ; %tid < 5 skips the inner diamond and goes straight to %bb6
  56 ; CHECK: DIVERGENT: br i1 %cond,
  57 bb2:
  58   %cond2 = icmp slt i32 %n, 0 ; computed from the parameter %n alone
  59   br i1 %cond2, label %bb4, label %bb3 ; inner branch: %bb4 when %n < 0, otherwise %bb3
  60 bb3:
  61   br label %bb5
  62 bb4:
  63   br label %bb5
  64 bb5:
  65   %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ] ; join of the inner branch: %b when %n < 0, otherwise %a
  66 ; CHECK-NOT: DIVERGENT: %c =
  67   br label %bb6
  68 bb6:
  69   %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ] ; join of the outer, tid-dependent branch: 0 when it jumped here from %bb1
  70 ; CHECK: DIVERGENT: %c2 =
  71   ret i32 %c2
  72 }
  73
  74 ; We conservatively treat all parameters of a __device__ function as divergent.
  75 define i32 @device(i32 %n, i32 %a, i32 %b) { ; not listed in !nvvm.annotations
  76 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device'
  77 ; CHECK: DIVERGENT: i32 %n
  78 ; CHECK: DIVERGENT: i32 %a
  79 ; CHECK: DIVERGENT: i32 %b
  80 entry:
  81   %cond = icmp slt i32 %n, 0 ; computed from the parameter %n, which is expected to be reported as divergent
  82   br i1 %cond, label %then, label %else ; no thread-index read in this function; the condition comes from %n only
  83 ; CHECK: DIVERGENT: br i1 %cond,
  84 then:
  85   br label %merge
  86 else:
  87   br label %merge
  88 merge:
  89   %c = phi i32 [ %a, %then ], [ %b, %else ] ; merges the parameters %a and %b
  90   ret i32 %c
  91 }
  92
  93 ; int i = 0;
  94 ; do {
  95 ;   i++;                  // i here is uniform
  96 ; } while (i < laneid);
  97 ; return i == 10 ? 0 : 1; // i here is divergent
  98 ;
  99 ; The i defined in the loop is used outside.
 100 define i32 @loop() { ; listed as a kernel in !nvvm.annotations
 101 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop'
 102 entry:
 103   %laneid = call i32 @llvm.ptx.read.laneid() ; loop bound
 104   br label %loop
 105 loop:
 106   %i = phi i32 [ 0, %entry ], [ %i1, %loop ] ; 0 on entry, %i1 on the back edge
 107 ; CHECK-NOT: DIVERGENT: %i =
 108   %i1 = add i32 %i, 1 ; i++
 109   %exit_cond = icmp sge i32 %i1, %laneid ; leave the loop once %i1 >= %laneid
 110   br i1 %exit_cond, label %loop_exit, label %loop
 111 loop_exit:
 112   %cond = icmp eq i32 %i, 10 ; uses the loop-defined %i directly, outside the loop
 113   br i1 %cond, label %then, label %else
 114 ; CHECK: DIVERGENT: br i1 %cond,
 115 then:
 116   ret i32 0
 117 else:
 118   ret i32 1
 119 }
 120
 121 ; Same as @loop, but the loop is in the LCSSA form.
 122 define i32 @lcssa() { ; takes no parameters and is not listed in !nvvm.annotations
 123 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa'
 124 entry:
 125   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; loop bound
 126   br label %loop
 127 loop:
 128   %i = phi i32 [ 0, %entry ], [ %i1, %loop ] ; 0 on entry, %i1 on the back edge
 129 ; CHECK-NOT: DIVERGENT: %i =
 130   %i1 = add i32 %i, 1 ; i++
 131   %exit_cond = icmp sge i32 %i1, %tid ; leave the loop once %i1 >= %tid
 132   br i1 %exit_cond, label %loop_exit, label %loop
 133 loop_exit:
 134   %i.lcssa = phi i32 [ %i, %loop ] ; single-input phi that carries %i out of the loop
 135 ; CHECK: DIVERGENT: %i.lcssa =
 136   %cond = icmp eq i32 %i.lcssa, 10 ; uses the phi above rather than %i itself
 137   br i1 %cond, label %then, label %else
 138 ; CHECK: DIVERGENT: br i1 %cond,
 139 then:
 140   ret i32 0
 141 else:
 142   ret i32 1
 143 }
 144
 145 ; This test contains an unstructured loop.
 146 ;           +-------------- entry ----------------+
 147 ;           |                                     |
 148 ;           V                                     V
 149 ; i1 = phi(0, i3)                            i2 = phi(0, i3)
 150 ;     j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2
 151 ;           ^                 |                   ^
 152 ;           |                 V                   |
 153 ;           +-------- switch (tid / i3) ----------+
 154 ;                             |
 155 ;                             V
 156 ;                        if (i3 == 5) // divergent
 157 ; because sync dependent on (tid / i3).
 158 define i32 @unstructured_loop(i1 %entry_cond) { ; listed as a kernel in !nvvm.annotations
 159 ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop'
 160 entry:
 161   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 162   br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 ; the cycle can be entered at either block
 163 loop_entry_1:
 164   %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] ; 0 on first entry, %i3 when re-entered from the latch
 165   %j1 = add i32 %i1, 1
 166   br label %loop_body
 167 loop_entry_2:
 168   %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] ; 0 on first entry, %i3 when re-entered from the latch
 169   %j2 = add i32 %i2, 2
 170   br label %loop_body
 171 loop_body:
 172   %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] ; value depends on which entry block ran last
 173   br label %loop_latch
 174 loop_latch:
 175   %div = sdiv i32 %tid, %i3 ; tid / i3, the switch selector
 176   switch i32 %div, label %branch [ i32 1, label %loop_entry_1
 177                                    i32 2, label %loop_entry_2 ] ; 1 -> loop_entry_1, 2 -> loop_entry_2, anything else leaves via %branch
 178 branch:
 179   %cmp = icmp eq i32 %i3, 5 ; uses %i3 after leaving the cycle
 180   br i1 %cmp, label %then, label %else
 181 ; CHECK: DIVERGENT: br i1 %cmp,
 182 then:
 183   ret i32 0
 184 else:
 185   ret i32 1
 186 }
 187
 188 ; Verifies sync-dependence is computed correctly in the absence of loops.
 189 define i32 @sync_no_loop(i32 %arg) { ; listed as a kernel in !nvvm.annotations
     ; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync_no_loop'
 190 entry:
 191   %0 = add i32 %arg, 1 ; defined before the branch, from the parameter only
 192   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 193   %1 = icmp sge i32 %tid, 10 ; computed directly from %tid
 194   br i1 %1, label %bb1, label %bb2 ; both sides rejoin at %bb3 without defining anything
 195 ; CHECK: DIVERGENT: br i1 %1,
 196 bb1:
 197   br label %bb3
 198
 199 bb2:
 200   br label %bb3
 201
 202 bb3:
 203   %2 = add i32 %0, 2 ; not a phi: its only operand %0 is defined before the branch
 204   ; CHECK-NOT: DIVERGENT: %2
 205   ret i32 %2
 206 }
 207
 208 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; thread-index readers; the tests expect branches computed from them to be divergent
 209 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() ; used by @sync
 210 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() ; used by @mixed
 211 declare i32 @llvm.ptx.read.laneid() ; used by @loop as the loop bound
 212
 213 !nvvm.annotations = !{!0, !1, !2, !3, !4, !5} ; functions marked as kernels; @device and @lcssa are not listed
 214 !0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1}
 215 !1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1}
 216 !2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1}
 217 !3 = !{i32 ()* @loop, !"kernel", i32 1}
 218 !4 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1}
 219 !5 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1}