lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 We should make the following changes to clean up MachineInstr:
   6
   7 1. Add an Opcode field to TargetInstrDescriptor, so you can tell the opcode of
   8    an instruction with just a TargetInstrDescriptor*.
   9 2. Remove the Opcode field from MachineInstr, replacing it with a
  10    TargetInstrDescriptor*.
  11 3. Getting information about a machine instr then becomes:
  12      MI->getInfo()->isTwoAddress()
  13    instead of:
  14      const TargetInstrInfo &TII = ...
  15      TII.isTwoAddrInstr(MI->getOpcode())
  16
  17 //===---------------------------------------------------------------------===//
  18
  19 With the recent changes to make the implicit def/use set explicit in
  20 machineinstrs, we should change the target descriptions for 'call' instructions
  21 so that the .td files don't list all the call-clobbered registers as implicit
  22 defs.  Instead, these should be added by the code generator (e.g. on the dag).
  23
  24 This has a number of uses:
  25
  26 1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
  27    for their different impdef sets.
  28 2. Targets with multiple calling convs (e.g. x86) which have different clobber
  29    sets don't need copies of call instructions.
  30 3. 'Interprocedural register allocation' can be done to reduce the clobber sets
  31    of calls.
  32
  33 //===---------------------------------------------------------------------===//
  34
  35 FreeBench/mason contains code like this:
  36
  37 static p_type m0u(p_type p) {
  38   int m[]={0, 8, 1, 2, 16, 5, 13, 7, 14, 9, 3, 4, 11, 12, 15, 10, 17, 6};
  39   p_type pu;
  40   pu.a = m[p.a];
  41   pu.b = m[p.b];
  42   pu.c = m[p.c];
  43   return pu;
  44 }
  45
  46 We currently compile this into a memcpy from a static array into 'm', then
  47 a bunch of loads from m.  It would be better to avoid the memcpy and just do
  48 loads from the static array.
  49
  50 //===---------------------------------------------------------------------===//
  51
  52 Make the PPC branch selector target independant
  53
  54 //===---------------------------------------------------------------------===//
  55
  56 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  57 precision don't matter (ffastmath).  Misc/mandel will like this. :)
  58
  59 //===---------------------------------------------------------------------===//
  60
  61 Solve this DAG isel folding deficiency:
  62
  63 int X, Y;
  64
  65 void fn1(void)
  66 {
  67   X = X | (Y << 3);
  68 }
  69
  70 compiles to
  71
  72 fn1:
  73         movl Y, %eax
  74         shll $3, %eax
  75         orl X, %eax
  76         movl %eax, X
  77         ret
  78
  79 The problem is the store's chain operand is not the load X but rather
  80 a TokenFactor of the load X and load Y, which prevents the folding.
  81
  82 There are two ways to fix this:
  83
  84 1. The dag combiner can start using alias analysis to realize that y/x
  85    don't alias, making the store to X not dependent on the load from Y.
  86 2. The generated isel could be made smarter in the case it can't
  87    disambiguate the pointers.
  88
  89 Number 1 is the preferred solution.
  90
  91 This has been "fixed" by a TableGen hack. But that is a short term workaround
  92 which will be removed once the proper fix is made.
  93
  94 //===---------------------------------------------------------------------===//
  95
  96 On targets with expensive 64-bit multiply, we could LSR this:
  97
  98 for (i = ...; ++i) {
  99    x = 1ULL << i;
 100
 101 into:
 102  long long tmp = 1;
 103  for (i = ...; ++i, tmp+=tmp)
 104    x = tmp;
 105
 106 This would be a win on ppc32, but not x86 or ppc64.
 107
 108 //===---------------------------------------------------------------------===//
 109
 110 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
 111
 112 //===---------------------------------------------------------------------===//
 113
 114 Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
 115
 116 //===---------------------------------------------------------------------===//
 117
 118 Interesting? testcase for add/shift/mul reassoc:
 119
 120 int bar(int x, int y) {
 121   return x*x*x+y+x*x*x*x*x*y*y*y*y;
 122 }
 123 int foo(int z, int n) {
 124   return bar(z, n) + bar(2*z, 2*n);
 125 }
 126
 127 //===---------------------------------------------------------------------===//
 128
 129 These two functions should generate the same code on big-endian systems:
 130
 131 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 132 int h(int *j, int *l) {  return *j - *l; }
 133
 134 this could be done in SelectionDAGISel.cpp, along with other special cases,
 135 for 1,2,4,8 bytes.
 136
 137 //===---------------------------------------------------------------------===//
 138
 139 This code:
 140 int rot(unsigned char b) { int a = ((b>>1) ^ (b<<7)) & 0xff; return a; }
 141
 142 Can be improved in two ways:
 143
 144 1. The instcombiner should eliminate the type conversions.
 145 2. The X86 backend should turn this into a rotate by one bit.
 146
 147 //===---------------------------------------------------------------------===//
 148
 149 Add LSR exit value substitution. It'll probably be a win for Ackermann, etc.
 150
 151 //===---------------------------------------------------------------------===//
 152
 153 It would be nice to revert this patch:
 154 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 155
 156 And teach the dag combiner enough to simplify the code expanded before
 157 legalize.  It seems plausible that this knowledge would let it simplify other
 158 stuff too.
 159
 160 //===---------------------------------------------------------------------===//
 161
 162 For packed types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 163 to the type size. It works but can be overly conservative as the alignment of
 164 specific packed types are target dependent.
 165
 166 //===---------------------------------------------------------------------===//
 167
 168 We should add 'unaligned load/store' nodes, and produce them from code like
 169 this:
 170
 171 v4sf example(float *P) {
 172   return (v4sf){P[0], P[1], P[2], P[3] };
 173 }
 174
 175 //===---------------------------------------------------------------------===//
 176
 177 We should constant fold packed type casts at the LLVM level, regardless of the
 178 cast.  Currently we cannot fold some casts because we don't have TargetData
 179 information in the constant folder, so we don't know the endianness of the
 180 target!
 181
 182 //===---------------------------------------------------------------------===//
 183
 184 Add support for conditional increments, and other related patterns.  Instead
 185 of:
 186
 187         movl 136(%esp), %eax
 188         cmpl $0, %eax
 189         je LBB16_2      #cond_next
 190 LBB16_1:        #cond_true
 191         incl _foo
 192 LBB16_2:        #cond_next
 193
 194 emit:
 195         movl    _foo, %eax
 196         cmpl    $1, %edi
 197         sbbl    $-1, %eax
 198         movl    %eax, _foo
 199
 200 //===---------------------------------------------------------------------===//
 201
 202 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 203
 204 Expand these to calls of sin/cos and stores:
 205       double sincos(double x, double *sin, double *cos);
 206       float sincosf(float x, float *sin, float *cos);
 207       long double sincosl(long double x, long double *sin, long double *cos);
 208
 209 Doing so could allow SROA of the destination pointers.  See also:
 210 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 211
 212 //===---------------------------------------------------------------------===//
 213
 214 Scalar Repl cannot currently promote this testcase to 'ret long cst':
 215
 216         %struct.X = type { int, int }
 217         %struct.Y = type { %struct.X }
 218 ulong %bar() {
 219         %retval = alloca %struct.Y, align 8             ; <%struct.Y*> [#uses=3]
 220         %tmp12 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 0
 221         store int 0, int* %tmp12
 222         %tmp15 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 1
 223         store int 1, int* %tmp15
 224         %retval = cast %struct.Y* %retval to ulong*
 225         %retval = load ulong* %retval           ; <ulong> [#uses=1]
 226         ret ulong %retval
 227 }
 228
 229 it should be extended to do so.
 230
 231 //===---------------------------------------------------------------------===//
 232
 233 Turn this into a single byte store with no load (the other 3 bytes are
 234 unmodified):
 235
 236 void %test(uint* %P) {
 237         %tmp = load uint* %P
 238         %tmp14 = or uint %tmp, 3305111552
 239         %tmp15 = and uint %tmp14, 3321888767
 240         store uint %tmp15, uint* %P
 241         ret void
 242 }
 243
 244 //===---------------------------------------------------------------------===//
 245
 246 dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
 247
 248 Compile:
 249
 250 int bar(int x)
 251 {
 252   int t = __builtin_clz(x);
 253   return -(t>>5);
 254 }
 255
 256 to:
 257
 258 _bar:   addic r3,r3,-1
 259         subfe r3,r3,r3
 260         blr
 261
 262 //===---------------------------------------------------------------------===//
 263
 264 Legalize should lower ctlz like this:
 265   ctlz(x) = popcnt((x-1) & ~x)
 266
 267 on targets that have popcnt but not ctlz.  itanium, what else?
 268
 269 //===---------------------------------------------------------------------===//
 270
 271 quantum_sigma_x in 462.libquantum contains the following loop:
 272
 273       for(i=0; i<reg->size; i++)
 274         {
 275           /* Flip the target bit of each basis state */
 276           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 277         }
 278
 279 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 280 so cool to turn it into something like:
 281
 282    long long Res = ((MAX_UNSIGNED) 1 << target);
 283    if (target < 32) {
 284      for(i=0; i<reg->size; i++)
 285        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 286    } else {
 287      for(i=0; i<reg->size; i++)
 288        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 289    }
 290
 291 ... which would only do one 32-bit XOR per loop iteration instead of two.
 292
 293 It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
 294 alas...
 295
 296 //===---------------------------------------------------------------------===//
 297
 298 This isn't recognized as bswap by instcombine:
 299
 300 unsigned int swap_32(unsigned int v) {
 301   v = ((v & 0x00ff00ffU) << 8)  | ((v & 0xff00ff00U) >> 8);
 302   v = ((v & 0x0000ffffU) << 16) | ((v & 0xffff0000U) >> 16);
 303   return v;
 304 }
 305
 306 //===---------------------------------------------------------------------===//
 307
 308 These should turn into single 16-bit (unaligned?) loads on little/big endian
 309 processors.
 310
 311 unsigned short read_16_le(const unsigned char *adr) {
 312   return adr[0] | (adr[1] << 8);
 313 }
 314 unsigned short read_16_be(const unsigned char *adr) {
 315   return (adr[0] << 8) | adr[1];
 316 }
 317
 318 //===---------------------------------------------------------------------===//
 319
 320 -scalarrepl should promote this to be a vector scalar.
 321
 322         %struct..0anon = type { <4 x float> }
 323 implementation   ; Functions:
 324 void %test1(<4 x float> %V, float* %P) {
 325 entry:
 326         %u = alloca %struct..0anon, align 16            ; <%struct..0anon*> [#uses=2]
 327         %tmp = getelementptr %struct..0anon* %u, int 0, uint 0          ; <<4 x float>*> [#uses=1]
 328         store <4 x float> %V, <4 x float>* %tmp
 329         %tmp1 = cast %struct..0anon* %u to [4 x float]*         ; <[4 x float]*> [#uses=1]
 330         %tmp = getelementptr [4 x float]* %tmp1, int 0, int 1           ; <float*> [#uses=1]
 331         %tmp = load float* %tmp         ; <float> [#uses=1]
 332         %tmp3 = mul float %tmp, 2.000000e+00            ; <float> [#uses=1]
 333         store float %tmp3, float* %P
 334         ret void
 335 }
 336
 337 //===---------------------------------------------------------------------===//
 338
 339 -instcombine should handle this transform:
 340    setcc (sdiv X / C1 ), C2
 341 when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands.
 342
 343 Currently InstCombine avoids this transform but will do it when the signs of
 344 the operands and the sign of the divide match. See the FIXME in
 345 InstructionCombining.cpp in the visitSetCondInst method after the switch case
 346 for Instruction::UDiv (around line 4447) for more details.
 347
 348 The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
 349 this construct.
 350
 351 //===---------------------------------------------------------------------===//
 352
 353 Instcombine misses several of these cases (see the testcase in the patch):
 354 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg01519.html
 355
 356 //===---------------------------------------------------------------------===//
 357
 358 viterbi speeds up *significantly* if the various "history" related copy loops
 359 are turned into memcpy calls at the source level.  We need a "loops to memcpy"
 360 pass.
 361
 362 //===---------------------------------------------------------------------===//
 363
 364 -predsimplify should transform this:
 365
 366 void bad(unsigned x)
 367 {
 368   if (x > 4)
 369     bar(12);
 370   else if (x > 3)
 371     bar(523);
 372   else if (x > 2)
 373     bar(36);
 374   else if (x > 1)
 375     bar(65);
 376   else if (x > 0)
 377     bar(45);
 378   else
 379     bar(367);
 380 }
 381
 382 into:
 383
 384 void good(unsigned x)
 385 {
 386   if (x == 4)
 387     bar(523);
 388   else if (x == 3)
 389     bar(36);
 390   else if (x == 2)
 391     bar(65);
 392   else if (x == 1)
 393     bar(45);
 394   else if (x == 0)
 395     bar(367);
 396   else
 397     bar(12);
 398 }
 399
 400 to enable further optimizations.
 401
 402 //===---------------------------------------------------------------------===//