lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 We should make the following changes to clean up MachineInstr:
   6
   7 1. Add an Opcode field to TargetInstrDescriptor, so you can tell the opcode of
   8    an instruction with just a TargetInstrDescriptor*.
   9 2. Remove the Opcode field from MachineInstr, replacing it with a
  10    TargetInstrDescriptor*.
  11 3. Getting information about a machine instr then becomes:
  12      MI->getInfo()->isTwoAddress()
  13    instead of:
  14      const TargetInstrInfo &TII = ...
  15      TII.isTwoAddrInstr(MI->getOpcode())
  16
  17 //===---------------------------------------------------------------------===//
  18
  19 FreeBench/mason contains code like this:
  20
  21 static p_type m0u(p_type p) {
  22   int m[]={0, 8, 1, 2, 16, 5, 13, 7, 14, 9, 3, 4, 11, 12, 15, 10, 17, 6};
  23   p_type pu;
  24   pu.a = m[p.a];
  25   pu.b = m[p.b];
  26   pu.c = m[p.c];
  27   return pu;
  28 }
  29
  30 We currently compile this into a memcpy from a static array into 'm', then
  31 a bunch of loads from m.  It would be better to avoid the memcpy and just do
  32 loads from the static array.
  33
  34 //===---------------------------------------------------------------------===//
  35
  36 Make the PPC branch selector target independant
  37
  38 //===---------------------------------------------------------------------===//
  39
  40 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  41 precision don't matter (ffastmath).  Misc/mandel will like this. :)
  42
  43 //===---------------------------------------------------------------------===//
  44
  45 Solve this DAG isel folding deficiency:
  46
  47 int X, Y;
  48
  49 void fn1(void)
  50 {
  51   X = X | (Y << 3);
  52 }
  53
  54 compiles to
  55
  56 fn1:
  57         movl Y, %eax
  58         shll $3, %eax
  59         orl X, %eax
  60         movl %eax, X
  61         ret
  62
  63 The problem is the store's chain operand is not the load X but rather
  64 a TokenFactor of the load X and load Y, which prevents the folding.
  65
  66 There are two ways to fix this:
  67
  68 1. The dag combiner can start using alias analysis to realize that y/x
  69    don't alias, making the store to X not dependent on the load from Y.
  70 2. The generated isel could be made smarter in the case it can't
  71    disambiguate the pointers.
  72
  73 Number 1 is the preferred solution.
  74
  75 This has been "fixed" by a TableGen hack. But that is a short term workaround
  76 which will be removed once the proper fix is made.
  77
  78 //===---------------------------------------------------------------------===//
  79
  80 On targets with expensive 64-bit multiply, we could LSR this:
  81
  82 for (i = ...; ++i) {
  83    x = 1ULL << i;
  84
  85 into:
  86  long long tmp = 1;
  87  for (i = ...; ++i, tmp+=tmp)
  88    x = tmp;
  89
  90 This would be a win on ppc32, but not x86 or ppc64.
  91
  92 //===---------------------------------------------------------------------===//
  93
  94 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
  95
  96 //===---------------------------------------------------------------------===//
  97
  98 Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
  99
 100 //===---------------------------------------------------------------------===//
 101
 102 Interesting? testcase for add/shift/mul reassoc:
 103
 104 int bar(int x, int y) {
 105   return x*x*x+y+x*x*x*x*x*y*y*y*y;
 106 }
 107 int foo(int z, int n) {
 108   return bar(z, n) + bar(2*z, 2*n);
 109 }
 110
 111 //===---------------------------------------------------------------------===//
 112
 113 These two functions should generate the same code on big-endian systems:
 114
 115 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 116 int h(int *j, int *l) {  return *j - *l; }
 117
 118 this could be done in SelectionDAGISel.cpp, along with other special cases,
 119 for 1,2,4,8 bytes.
 120
 121 //===---------------------------------------------------------------------===//
 122
 123 This code:
 124 int rot(unsigned char b) { int a = ((b>>1) ^ (b<<7)) & 0xff; return a; }
 125
 126 Can be improved in two ways:
 127
 128 1. The instcombiner should eliminate the type conversions.
 129 2. The X86 backend should turn this into a rotate by one bit.
 130
 131 //===---------------------------------------------------------------------===//
 132
 133 Add LSR exit value substitution. It'll probably be a win for Ackermann, etc.
 134
 135 //===---------------------------------------------------------------------===//
 136
 137 It would be nice to revert this patch:
 138 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 139
 140 And teach the dag combiner enough to simplify the code expanded before
 141 legalize.  It seems plausible that this knowledge would let it simplify other
 142 stuff too.
 143
 144 //===---------------------------------------------------------------------===//
 145
 146 For packed types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 147 to the type size. It works but can be overly conservative as the alignment of
 148 specific packed types are target dependent.
 149
 150 //===---------------------------------------------------------------------===//
 151
 152 We should add 'unaligned load/store' nodes, and produce them from code like
 153 this:
 154
 155 v4sf example(float *P) {
 156   return (v4sf){P[0], P[1], P[2], P[3] };
 157 }
 158
 159 //===---------------------------------------------------------------------===//
 160
 161 We should constant fold packed type casts at the LLVM level, regardless of the
 162 cast.  Currently we cannot fold some casts because we don't have TargetData
 163 information in the constant folder, so we don't know the endianness of the
 164 target!
 165
 166 //===---------------------------------------------------------------------===//
 167
 168 Add support for conditional increments, and other related patterns.  Instead
 169 of:
 170
 171         movl 136(%esp), %eax
 172         cmpl $0, %eax
 173         je LBB16_2      #cond_next
 174 LBB16_1:        #cond_true
 175         incl _foo
 176 LBB16_2:        #cond_next
 177
 178 emit:
 179         movl    _foo, %eax
 180         cmpl    $1, %edi
 181         sbbl    $-1, %eax
 182         movl    %eax, _foo
 183
 184 //===---------------------------------------------------------------------===//
 185
 186 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 187
 188 Expand these to calls of sin/cos and stores:
 189       double sincos(double x, double *sin, double *cos);
 190       float sincosf(float x, float *sin, float *cos);
 191       long double sincosl(long double x, long double *sin, long double *cos);
 192
 193 Doing so could allow SROA of the destination pointers.  See also:
 194 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 195
 196 //===---------------------------------------------------------------------===//
 197
 198 Scalar Repl cannot currently promote this testcase to 'ret long cst':
 199
 200         %struct.X = type { int, int }
 201         %struct.Y = type { %struct.X }
 202 ulong %bar() {
 203         %retval = alloca %struct.Y, align 8             ; <%struct.Y*> [#uses=3]
 204         %tmp12 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 0
 205         store int 0, int* %tmp12
 206         %tmp15 = getelementptr %struct.Y* %retval, int 0, uint 0, uint 1
 207         store int 1, int* %tmp15
 208         %retval = cast %struct.Y* %retval to ulong*
 209         %retval = load ulong* %retval           ; <ulong> [#uses=1]
 210         ret ulong %retval
 211 }
 212
 213 it should be extended to do so.
 214
 215 //===---------------------------------------------------------------------===//
 216
 217 Turn this into a single byte store with no load (the other 3 bytes are
 218 unmodified):
 219
 220 void %test(uint* %P) {
 221         %tmp = load uint* %P
 222         %tmp14 = or uint %tmp, 3305111552
 223         %tmp15 = and uint %tmp14, 3321888767
 224         store uint %tmp15, uint* %P
 225         ret void
 226 }
 227
 228 //===---------------------------------------------------------------------===//
 229
 230 dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
 231
 232 Compile:
 233
 234 int bar(int x)
 235 {
 236   int t = __builtin_clz(x);
 237   return -(t>>5);
 238 }
 239
 240 to:
 241
 242 _bar:   addic r3,r3,-1
 243         subfe r3,r3,r3
 244         blr
 245
 246 //===---------------------------------------------------------------------===//
 247
 248 Legalize should lower ctlz like this:
 249   ctlz(x) = popcnt((x-1) & ~x)
 250
 251 on targets that have popcnt but not ctlz.  itanium, what else?
 252
 253 //===---------------------------------------------------------------------===//
 254
 255 quantum_sigma_x in 462.libquantum contains the following loop:
 256
 257       for(i=0; i<reg->size; i++)
 258         {
 259           /* Flip the target bit of each basis state */
 260           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 261         }
 262
 263 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 264 so cool to turn it into something like:
 265
 266    long long Res = ((MAX_UNSIGNED) 1 << target);
 267    if (target < 32) {
 268      for(i=0; i<reg->size; i++)
 269        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 270    } else {
 271      for(i=0; i<reg->size; i++)
 272        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 273    }
 274
 275 ... which would only do one 32-bit XOR per loop iteration instead of two.
 276
 277 It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
 278 alas...
 279
 280 //===---------------------------------------------------------------------===//
 281
 282 This isn't recognized as bswap by instcombine:
 283
 284 unsigned int swap_32(unsigned int v) {
 285   v = ((v & 0x00ff00ffU) << 8)  | ((v & 0xff00ff00U) >> 8);
 286   v = ((v & 0x0000ffffU) << 16) | ((v & 0xffff0000U) >> 16);
 287   return v;
 288 }
 289
 290 Nor is this:
 291
 292 ushort %bad(ushort %a) {
 293 entry:
 294         %tmp = cast ushort %a to uint           ; <uint> [#uses=1]
 295         %tmp2 = shr uint %tmp, ubyte 8          ; <uint> [#uses=1]
 296         %tmp2 = cast uint %tmp2 to ushort               ; <ushort> [#uses=1]
 297         %tmp5 = shl ushort %a, ubyte 8          ; <ushort> [#uses=1]
 298         %tmp6 = or ushort %tmp2, %tmp5          ; <ushort> [#uses=1]
 299         ret ushort %tmp6
 300 }
 301
 302 unsigned short bad(unsigned short a) {
 303        return ((a & 0xff00) >> 8 | (a & 0x00ff) << 8);
 304 }
 305
 306 //===---------------------------------------------------------------------===//
 307
 308 These should turn into single 16-bit (unaligned?) loads on little/big endian
 309 processors.
 310
 311 unsigned short read_16_le(const unsigned char *adr) {
 312   return adr[0] | (adr[1] << 8);
 313 }
 314 unsigned short read_16_be(const unsigned char *adr) {
 315   return (adr[0] << 8) | adr[1];
 316 }
 317
 318 //===---------------------------------------------------------------------===//
 319
 320 -scalarrepl should promote this to be a vector scalar.
 321
 322         %struct..0anon = type { <4 x float> }
 323 implementation   ; Functions:
 324 void %test1(<4 x float> %V, float* %P) {
 325 entry:
 326         %u = alloca %struct..0anon, align 16            ; <%struct..0anon*> [#uses=2]
 327         %tmp = getelementptr %struct..0anon* %u, int 0, uint 0          ; <<4 x float>*> [#uses=1]
 328         store <4 x float> %V, <4 x float>* %tmp
 329         %tmp1 = cast %struct..0anon* %u to [4 x float]*         ; <[4 x float]*> [#uses=1]
 330         %tmp = getelementptr [4 x float]* %tmp1, int 0, int 1           ; <float*> [#uses=1]
 331         %tmp = load float* %tmp         ; <float> [#uses=1]
 332         %tmp3 = mul float %tmp, 2.000000e+00            ; <float> [#uses=1]
 333         store float %tmp3, float* %P
 334         ret void
 335 }
 336
 337 //===---------------------------------------------------------------------===//
 338
 339 -instcombine should handle this transform:
 340    setcc (sdiv X / C1 ), C2
 341 when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands.
 342
 343 Currently InstCombine avoids this transform but will do it when the signs of
 344 the operands and the sign of the divide match. See the FIXME in
 345 InstructionCombining.cpp in the visitSetCondInst method after the switch case
 346 for Instruction::UDiv (around line 4447) for more details.
 347
 348 The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
 349 this construct.
 350
 351 //===---------------------------------------------------------------------===//
 352
 353 Instcombine misses several of these cases (see the testcase in the patch):
 354 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg01519.html
 355
 356 //===---------------------------------------------------------------------===//
 357
 358 viterbi speeds up *significantly* if the various "history" related copy loops
 359 are turned into memcpy calls at the source level.  We need a "loops to memcpy"
 360 pass.
 361
 362 //===---------------------------------------------------------------------===//
 363
 364 -predsimplify should transform this:
 365
 366 void bad(unsigned x)
 367 {
 368   if (x > 4)
 369     bar(12);
 370   else if (x > 3)
 371     bar(523);
 372   else if (x > 2)
 373     bar(36);
 374   else if (x > 1)
 375     bar(65);
 376   else if (x > 0)
 377     bar(45);
 378   else
 379     bar(367);
 380 }
 381
 382 into:
 383
 384 void good(unsigned x)
 385 {
 386   if (x == 4)
 387     bar(523);
 388   else if (x == 3)
 389     bar(36);
 390   else if (x == 2)
 391     bar(65);
 392   else if (x == 1)
 393     bar(45);
 394   else if (x == 0)
 395     bar(367);
 396   else
 397     bar(12);
 398 }
 399
 400 to enable further optimizations.
 401
 402 //===---------------------------------------------------------------------===//