lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 With the recent changes to make the implicit def/use set explicit in
   6 machineinstrs, we should change the target descriptions for 'call' instructions
   7 so that the .td files don't list all the call-clobbered registers as implicit
   8 defs.  Instead, these should be added by the code generator (e.g. on the dag).
   9
  10 This has a number of uses:
  11
  12 1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
  13    for their different impdef sets.
  14 2. Targets with multiple calling convs (e.g. x86) which have different clobber
  15    sets don't need copies of call instructions.
  16 3. 'Interprocedural register allocation' can be done to reduce the clobber sets
  17    of calls.
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 FreeBench/mason contains code like this:
  22
  23 typedef struct { int a; int b; int c; } p_type;
  24 extern int m[];
  25 p_type m0u(p_type *p) {
  26   int m[]={0, 8, 1, 2, 16, 5, 13, 7, 14, 9, 3, 4, 11, 12, 15, 10, 17, 6};
  27   p_type pu;
  28   pu.a = m[p->a];
  29   pu.b = m[p->b];
  30   pu.c = m[p->c];
  31   return pu;
  32 }
  33
  34 We currently compile this into a memcpy from a static array into 'm', then
  35 a bunch of loads from m.  It would be better to avoid the memcpy and just do
  36 loads from the static array.
  37
  38 //===---------------------------------------------------------------------===//
  39
  40 Make the PPC branch selector target independent.
  41
  42 //===---------------------------------------------------------------------===//
  43
  44 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  45 precision don't matter (-ffast-math).  Misc/mandel will like this. :)
  46
  47 //===---------------------------------------------------------------------===//
  48
  49 Solve this DAG isel folding deficiency:
  50
  51 int X, Y;
  52
  53 void fn1(void)
  54 {
  55   X = X | (Y << 3);
  56 }
  57
  58 compiles to
  59
  60 fn1:
  61         movl Y, %eax
  62         shll $3, %eax
  63         orl X, %eax
  64         movl %eax, X
  65         ret
  66
  67 The problem is the store's chain operand is not the load X but rather
  68 a TokenFactor of the load X and load Y, which prevents the folding.
  69
  70 There are two ways to fix this:
  71
  72 1. The dag combiner can start using alias analysis to realize that X and Y
  73    don't alias, making the store to X not dependent on the load from Y.
  74 2. The generated isel could be made smarter in the case it can't
  75    disambiguate the pointers.
  76
  77 Number 1 is the preferred solution.
  78
  79 This has been "fixed" by a TableGen hack. But that is a short term workaround
  80 which will be removed once the proper fix is made.
  81
  82 //===---------------------------------------------------------------------===//
  83
  84 On targets with expensive 64-bit multiply, we could LSR this:
  85
  86 for (i = ...; ++i) {
  87    x = 1ULL << i;
  88
  89 into:
  90  long long tmp = 1;
  91  for (i = ...; ++i, tmp+=tmp)
  92    x = tmp;
  93
  94 This would be a win on ppc32, but not x86 or ppc64.
  95
  96 //===---------------------------------------------------------------------===//
  97
  98 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
  99
 100 //===---------------------------------------------------------------------===//
 101
 102 Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
 103
 104 //===---------------------------------------------------------------------===//
 105
 106 Interesting? testcase for add/shift/mul reassoc:
 107
 108 int bar(int x, int y) {
 109   return x*x*x+y+x*x*x*x*x*y*y*y*y;
 110 }
 111 int foo(int z, int n) {
 112   return bar(z, n) + bar(2*z, 2*n);
 113 }
 114
 115 //===---------------------------------------------------------------------===//
 116
 117 These two functions should generate the same code on big-endian systems:
 118
 119 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 120 int h(int *j, int *l) {  return *j - *l; }
 121
 122 This could be done in SelectionDAGISel.cpp, along with other special cases,
 123 for 1,2,4,8 bytes.
 124
 125 //===---------------------------------------------------------------------===//
 126
 127 It would be nice to revert this patch:
 128 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 129
 130 And teach the dag combiner enough to simplify the code expanded before
 131 legalize.  It seems plausible that this knowledge would let it simplify other
 132 stuff too.
 133
 134 //===---------------------------------------------------------------------===//
 135
 136 For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 137 to the type size. It works but can be overly conservative as the alignment of
 138 specific vector types is target dependent.
 139
 140 //===---------------------------------------------------------------------===//
 141
 142 We should add 'unaligned load/store' nodes, and produce them from code like
 143 this:
 144
 145 v4sf example(float *P) {
 146   return (v4sf){P[0], P[1], P[2], P[3] };
 147 }
 148
 149 //===---------------------------------------------------------------------===//
 150
 151 We should constant fold vector type casts at the LLVM level, regardless of the
 152 cast.  Currently we cannot fold some casts because we don't have TargetData
 153 information in the constant folder, so we don't know the endianness of the
 154 target!
 155
 156 //===---------------------------------------------------------------------===//
 157
 158 Add support for conditional increments, and other related patterns.  Instead
 159 of:
 160
 161         movl 136(%esp), %eax
 162         cmpl $0, %eax
 163         je LBB16_2      #cond_next
 164 LBB16_1:        #cond_true
 165         incl _foo
 166 LBB16_2:        #cond_next
 167
 168 emit:
 169         movl    _foo, %eax
 170         cmpl    $1, %edi
 171         sbbl    $-1, %eax
 172         movl    %eax, _foo
 173
 174 //===---------------------------------------------------------------------===//
 175
 176 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 177
 178 Expand calls to these functions into calls of sin/cos and stores:
 179       void sincos(double x, double *sin, double *cos);
 180       void sincosf(float x, float *sin, float *cos);
 181       void sincosl(long double x, long double *sin, long double *cos);
 182
 183 Doing so could allow SROA of the destination pointers.  See also:
 184 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 185
 186 //===---------------------------------------------------------------------===//
 187
 188 Scalar Repl cannot currently promote this testcase to 'ret long cst':
 189
 190         %struct.X = type { int, int }
 191         %struct.Y = type { %struct.X }
 192 ulong %bar() {
 193         %retval = alloca %struct.Y, align 8
 194         %tmp12 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 0
 195         store int 0, int* %tmp12
 196         %tmp15 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 1
 197         store int 1, int* %tmp15
 198         %retval = bitcast %struct.Y* %retval to ulong*
 199         %retval = load ulong* %retval
 200         ret ulong %retval
 201 }
 202
 203 it should be extended to do so.
 204
 205 //===---------------------------------------------------------------------===//
 206
 207 -scalarrepl should promote this to be a vector scalar.
 208
 209         %struct..0anon = type { <4 x float> }
 210
 211 implementation   ; Functions:
 212
 213 void %test1(<4 x float> %V, float* %P) {
 214         %u = alloca %struct..0anon, align 16
 215         %tmp = getelementptr %struct..0anon* %u, int 0, uint 0
 216         store <4 x float> %V, <4 x float>* %tmp
 217         %tmp1 = bitcast %struct..0anon* %u to [4 x float]*
 218         %tmp = getelementptr [4 x float]* %tmp1, int 0, int 1
 219         %tmp = load float* %tmp
 220         %tmp3 = mul float %tmp, 2.000000e+00
 221         store float %tmp3, float* %P
 222         ret void
 223 }
 224
 225 //===---------------------------------------------------------------------===//
 226
 227 Turn this into a single byte store with no load (the other 3 bytes are
 228 unmodified):
 229
 230 void %test(uint* %P) {
 231         %tmp = load uint* %P
 232         %tmp14 = or uint %tmp, 3305111552
 233         %tmp15 = and uint %tmp14, 3321888767
 234         store uint %tmp15, uint* %P
 235         ret void
 236 }
 237
 238 //===---------------------------------------------------------------------===//
 239
 240 dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
 241
 242 Compile:
 243
 244 int bar(int x)
 245 {
 246   int t = __builtin_clz(x);
 247   return -(t>>5);
 248 }
 249
 250 to:
 251
 252 _bar:   addic r3,r3,-1
 253         subfe r3,r3,r3
 254         blr
 255
 256 //===---------------------------------------------------------------------===//
 257
 258 Legalize should lower cttz like this:
 259   cttz(x) = popcnt((x-1) & ~x)
 260
 261 on targets that have popcnt but not cttz.  itanium, what else?
 262
 263 //===---------------------------------------------------------------------===//
 264
 265 quantum_sigma_x in 462.libquantum contains the following loop:
 266
 267       for(i=0; i<reg->size; i++)
 268         {
 269           /* Flip the target bit of each basis state */
 270           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 271         }
 272
 273 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 274 so cool to turn it into something like:
 275
 276    long long Res = ((MAX_UNSIGNED) 1 << target);
 277    if (target < 32) {
 278      for(i=0; i<reg->size; i++)
 279        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 280    } else {
 281      for(i=0; i<reg->size; i++)
 282        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 283    }
 284
 285 ... which would only do one 32-bit XOR per loop iteration instead of two.
 286
 287 It would also be nice to recognize that reg->size doesn't alias
 288 reg->node[i], but alas...
 289
 290 //===---------------------------------------------------------------------===//
 291
 292 This isn't recognized as bswap by instcombine:
 293
 294 unsigned int swap_32(unsigned int v) {
 295   v = ((v & 0x00ff00ffU) << 8)  | ((v & 0xff00ff00U) >> 8);
 296   v = ((v & 0x0000ffffU) << 16) | ((v & 0xffff0000U) >> 16);
 297   return v;
 298 }
 299
 300 Nor is this (yes, it really is bswap):
 301
 302 unsigned long reverse(unsigned v) {
 303     unsigned t;
 304     t = v ^ ((v << 16) | (v >> 16));
 305     t &= ~0xff0000;
 306     v = (v << 24) | (v >> 8);
 307     return v ^ (t >> 8);
 308 }
 309
 310 //===---------------------------------------------------------------------===//
 311
 312 These should turn into single 16-bit (unaligned?) loads on little/big endian
 313 processors.
 314
 315 unsigned short read_16_le(const unsigned char *adr) {
 316   return adr[0] | (adr[1] << 8);
 317 }
 318 unsigned short read_16_be(const unsigned char *adr) {
 319   return (adr[0] << 8) | adr[1];
 320 }
 321
 322 //===---------------------------------------------------------------------===//
 323
 324 -instcombine should handle this transform:
 325    icmp pred (sdiv X / C1 ), C2
 326 when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands.
 327
 328 Currently InstCombine avoids this transform but will do it when the signs of
 329 the operands and the sign of the divide match. See the FIXME in
 330 InstructionCombining.cpp in the visitSetCondInst method after the switch case
 331 for Instruction::UDiv (around line 4447) for more details.
 332
 333 The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
 334 this construct.
 335
 336 //===---------------------------------------------------------------------===//
 337
 338 Instcombine misses several of these cases (see the testcase in the patch):
 339 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg01519.html
 340
 341 //===---------------------------------------------------------------------===//
 342
 343 viterbi speeds up *significantly* if the various "history" related copy loops
 344 are turned into memcpy calls at the source level.  We need a "loops to memcpy"
 345 pass.
 346
 347 //===---------------------------------------------------------------------===//
 348
 349 Consider:
 350
 351 typedef unsigned U32;
 352 typedef unsigned long long U64;
 353 int test (U32 *inst, U64 *regs) {
 354     U64 effective_addr2;
 355     U32 temp = *inst;
 356     int r1 = (temp >> 20) & 0xf;
 357     int b2 = (temp >> 16) & 0xf;
 358     effective_addr2 = temp & 0xfff;
 359     if (b2) effective_addr2 += regs[b2];
 360     b2 = (temp >> 12) & 0xf;
 361     if (b2) effective_addr2 += regs[b2];
 362     effective_addr2 &= regs[4];
 363      if ((effective_addr2 & 3) == 0)
 364         return 1;
 365     return 0;
 366 }
 367
 368 Note that only the low 2 bits of effective_addr2 are used.  On 32-bit systems,
 369 we don't eliminate the computation of the top half of effective_addr2 because
 370 we don't have whole-function selection dags.  On x86, this means we use one
 371 extra register for the function when effective_addr2 is declared as U64 than
 372 when it is declared U32.
 373
 374 //===---------------------------------------------------------------------===//
 375
 376 Promote for i32 bswap can use i64 bswap + shr.  Useful on targets with 64-bit
 377 regs and bswap, like itanium.
 378
 379 //===---------------------------------------------------------------------===//
 380
 381 LSR should know what GPR types a target has.  This code:
 382
 383 volatile short X, Y; // globals
 384
 385 void foo(int N) {
 386   int i;
 387   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 388 }
 389
 390 produces two identical IV's (after promotion) on PPC/ARM:
 391
 392 LBB1_1: @bb.preheader
 393         mov r3, #0
 394         mov r2, r3
 395         mov r1, r3
 396 LBB1_2: @bb
 397         ldr r12, LCPI1_0
 398         ldr r12, [r12]
 399         strh r2, [r12]
 400         ldr r12, LCPI1_1
 401         ldr r12, [r12]
 402         strh r3, [r12]
 403         add r1, r1, #1    <- [0,+,1]
 404         add r3, r3, #4
 405         add r2, r2, #1    <- [0,+,1]
 406         cmp r1, r0
 407         bne LBB1_2      @bb
 408
 409
 410 //===---------------------------------------------------------------------===//
 411