lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
   6 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
   7 X86, & make the dag combiner produce it when needed.  This will eliminate one
   8 imul from the code generated for:
   9
  10 long long test(long long X, long long Y) { return X*Y; }
  11
  12 by using the EAX result from the mul.  We should add a similar node for
  13 DIVREM.
  14
  15 another case is:
  16
  17 long long test(int X, int Y) { return (long long)X*Y; }
  18
  19 ... which should only be one imul instruction.
  20
  21 //===---------------------------------------------------------------------===//
  22
  23 This should be one DIV/IDIV instruction, not a libcall:
  24
  25 unsigned test(unsigned long long X, unsigned Y) {
  26         return X/Y;
  27 }
  28
  29 This can be done trivially with a custom legalizer.  What about overflow
  30 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 Some targets (e.g. athlons) prefer freep to fstp ST(0):
  35 http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 This should use fiadd on chips where it is profitable:
  40 double foo(double P, int *I) { return P+*I; }
  41
  42 //===---------------------------------------------------------------------===//
  43
  44 The FP stackifier needs to be global.  Also, it should handle simple permutates
  45 to reduce number of shuffle instructions, e.g. turning:
  46
  47 fld P   ->              fld Q
  48 fld Q                   fld P
  49 fxch
  50
  51 or:
  52
  53 fxch    ->              fucomi
  54 fucomi                  jl X
  55 jg X
  56
  57 Ideas:
  58 http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
  59
  60
  61 //===---------------------------------------------------------------------===//
  62
  63 Improvements to the multiply -> shift/add algorithm:
  64 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  65
  66 //===---------------------------------------------------------------------===//
  67
  68 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  69 long long foo(int x) { return 1LL << x; }
  70
  71 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  72 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  73 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  74
  75 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  76
  77 //===---------------------------------------------------------------------===//
  78
  79 Compile this:
  80 _Bool f(_Bool a) { return a!=1; }
  81
  82 into:
  83         movzbl  %dil, %eax
  84         xorl    $1, %eax
  85         ret
  86
  87 //===---------------------------------------------------------------------===//
  88
  89 Some isel ideas:
  90
  91 1. Dynamic programming based approach when compile time if not an
  92    issue.
  93 2. Code duplication (addressing mode) during isel.
  94 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  95    Sequencing of Instructions".
  96 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  97    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  98    and other related papers.
  99    http://citeseer.ist.psu.edu/govindarajan01minimum.html
 100
 101 //===---------------------------------------------------------------------===//
 102
 103 Should we promote i16 to i32 to avoid partial register update stalls?
 104
 105 //===---------------------------------------------------------------------===//
 106
 107 Leave any_extend as pseudo instruction and hint to register
 108 allocator. Delay codegen until post register allocation.
 109
 110 //===---------------------------------------------------------------------===//
 111
 112 Add a target specific hook to DAG combiner to handle SINT_TO_FP and
 113 FP_TO_SINT when the source operand is already in memory.
 114
 115 //===---------------------------------------------------------------------===//
 116
 117 Model X86 EFLAGS as a real register to avoid redudant cmp / test. e.g.
 118
 119         cmpl $1, %eax
 120         setg %al
 121         testb %al, %al  # unnecessary
 122         jne .BB7
 123
 124 //===---------------------------------------------------------------------===//
 125
 126 Count leading zeros and count trailing zeros:
 127
 128 int clz(int X) { return __builtin_clz(X); }
 129 int ctz(int X) { return __builtin_ctz(X); }
 130
 131 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 132 clz:
 133         bsr     %eax, DWORD PTR [%esp+4]
 134         xor     %eax, 31
 135         ret
 136 ctz:
 137         bsf     %eax, DWORD PTR [%esp+4]
 138         ret
 139
 140 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 141 aren't.
 142
 143 //===---------------------------------------------------------------------===//
 144
 145 Use push/pop instructions in prolog/epilog sequences instead of stores off
 146 ESP (certain code size win, perf win on some [which?] processors).
 147
 148 //===---------------------------------------------------------------------===//
 149
 150 Only use inc/neg/not instructions on processors where they are faster than
 151 add/sub/xor.  They are slower on the P4 due to only updating some processor
 152 flags.
 153
 154 //===---------------------------------------------------------------------===//
 155
 156 Open code rint,floor,ceil,trunc:
 157 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
 158 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
 159
 160 //===---------------------------------------------------------------------===//
 161
 162 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 163
 164 Expand these to calls of sin/cos and stores:
 165       double sincos(double x, double *sin, double *cos);
 166       float sincosf(float x, float *sin, float *cos);
 167       long double sincosl(long double x, long double *sin, long double *cos);
 168
 169 Doing so could allow SROA of the destination pointers.  See also:
 170 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 171
 172 //===---------------------------------------------------------------------===//
 173
 174 The instruction selector sometimes misses folding a load into a compare.  The
 175 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 176 commutative, it is not matched with the load on both sides.  The dag combiner
 177 should be made smart enough to cannonicalize the load into the RHS of a compare
 178 when it can invert the result of the compare for free.
 179
 180 //===---------------------------------------------------------------------===//
 181
 182 LSR should be turned on for the X86 backend and tuned to take advantage of its
 183 addressing modes.
 184
 185 //===---------------------------------------------------------------------===//
 186
 187 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
 188 other fast SSE modes.
 189
 190 //===---------------------------------------------------------------------===//
 191
 192 Think about doing i64 math in SSE regs.
 193
 194 //===---------------------------------------------------------------------===//
 195
 196 The DAG Isel doesn't fold the loads into the adds in this testcase.  The
 197 pattern selector does.  This is because the chain value of the load gets
 198 selected first, and the loads aren't checking to see if they are only used by
 199 and add.
 200
 201 .ll:
 202
 203 int %test(int* %x, int* %y, int* %z) {
 204         %X = load int* %x
 205         %Y = load int* %y
 206         %Z = load int* %z
 207         %a = add int %X, %Y
 208         %b = add int %a, %Z
 209         ret int %b
 210 }
 211
 212 dag isel:
 213
 214 _test:
 215         movl 4(%esp), %eax
 216         movl (%eax), %eax
 217         movl 8(%esp), %ecx
 218         movl (%ecx), %ecx
 219         addl %ecx, %eax
 220         movl 12(%esp), %ecx
 221         movl (%ecx), %ecx
 222         addl %ecx, %eax
 223         ret
 224
 225 pattern isel:
 226
 227 _test:
 228         movl 12(%esp), %ecx
 229         movl 4(%esp), %edx
 230         movl 8(%esp), %eax
 231         movl (%eax), %eax
 232         addl (%edx), %eax
 233         addl (%ecx), %eax
 234         ret
 235
 236 This is bad for register pressure, though the dag isel is producing a
 237 better schedule. :)
 238
 239 //===---------------------------------------------------------------------===//
 240
 241 This testcase should have no SSE instructions in it, and only one load from
 242 a constant pool:
 243
 244 double %test3(bool %B) {
 245         %C = select bool %B, double 123.412, double 523.01123123
 246         ret double %C
 247 }
 248
 249 Currently, the select is being lowered, which prevents the dag combiner from
 250 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
 251
 252 The pattern isel got this one right.
 253
 254 //===---------------------------------------------------------------------===//
 255
 256 We need to lower switch statements to tablejumps when appropriate instead of
 257 always into binary branch trees.
 258
 259 //===---------------------------------------------------------------------===//
 260
 261 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
 262 like this:
 263
 264   X += y
 265
 266 and the register allocator decides to spill X, it is cheaper to emit this as:
 267
 268 Y += [xslot]
 269 store Y -> [xslot]
 270
 271 than as:
 272
 273 tmp = [xslot]
 274 tmp += y
 275 store tmp -> [xslot]
 276
 277 ..and this uses one fewer register (so this should be done at load folding
 278 time, not at spiller time).  *Note* however that this can only be done
 279 if Y is dead.  Here's a testcase:
 280
 281 %.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
 282 implementation   ; Functions:
 283 declare void %printf(int, ...)
 284 void %main() {
 285 build_tree.exit:
 286         br label %no_exit.i7
 287 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
 288         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
 289         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
 290         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
 291         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
 292         br bool false, label %Compute_Tree.exit23, label %no_exit.i7
 293 Compute_Tree.exit23:            ; preds = %no_exit.i7
 294         tail call void (int, ...)* %printf( int 0 )
 295         store double %tmp.34.i18, double* null
 296         ret void
 297 }
 298
 299 We currently emit:
 300
 301 .BBmain_1:
 302         xorpd %XMM1, %XMM1
 303         addsd %XMM0, %XMM1
 304 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
 305 ***     addsd %XMM2, %XMM1
 306 ***     movsd QWORD PTR [%ESP + 8], %XMM2
 307         jmp .BBmain_1   # no_exit.i7
 308
 309 This is a bugpoint reduced testcase, which is why the testcase doesn't make
 310 much sense (e.g. its an infinite loop). :)
 311
 312 //===---------------------------------------------------------------------===//
 313
 314 None of the FPStack instructions are handled in
 315 X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
 316 folding spill code into the instructions.
 317
 318 //===---------------------------------------------------------------------===//
 319
 320 In many cases, LLVM generates code like this:
 321
 322 _test:
 323         movl 8(%esp), %eax
 324         cmpl %eax, 4(%esp)
 325         setl %al
 326         movzbl %al, %eax
 327         ret
 328
 329 on some processors (which ones?), it is more efficient to do this:
 330
 331 _test:
 332         movl 8(%esp), %ebx
 333         xor %eax, %eax
 334         cmpl %ebx, 4(%esp)
 335         setl %al
 336         ret
 337
 338 Doing this correctly is tricky though, as the xor clobbers the flags.
 339
 340 //===---------------------------------------------------------------------===//
 341
 342 We should generate 'test' instead of 'cmp' in various cases, e.g.:
 343
 344 bool %test(int %X) {
 345         %Y = shl int %X, ubyte 1
 346         %C = seteq int %Y, 0
 347         ret bool %C
 348 }
 349 bool %test(int %X) {
 350         %Y = and int %X, 8
 351         %C = seteq int %Y, 0
 352         ret bool %C
 353 }
 354
 355 This may just be a matter of using 'test' to write bigger patterns for X86cmp.
 356
 357 //===---------------------------------------------------------------------===//
 358
 359 SSE should implement 'select_cc' using 'emulated conditional moves' that use
 360 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
 361
 362 double %X(double %Y, double %Z, double %A, double %B) {
 363         %C = setlt double %A, %B
 364         %z = add double %Z, 0.0    ;; select operand is not a load
 365         %D = select bool %C, double %Y, double %z
 366         ret double %D
 367 }
 368
 369 We currently emit:
 370
 371 _X:
 372         subl $12, %esp
 373         xorpd %xmm0, %xmm0
 374         addsd 24(%esp), %xmm0
 375         movsd 32(%esp), %xmm1
 376         movsd 16(%esp), %xmm2
 377         ucomisd 40(%esp), %xmm1
 378         jb LBB_X_2
 379 LBB_X_1:
 380         movsd %xmm0, %xmm2
 381 LBB_X_2:
 382         movsd %xmm2, (%esp)
 383         fldl (%esp)
 384         addl $12, %esp
 385         ret
 386
 387 //===---------------------------------------------------------------------===//
 388
 389 The x86 backend currently supports dynamic-no-pic. Need to add asm
 390 printer support for static and PIC.
 391
 392 //===---------------------------------------------------------------------===//
 393
 394 We should generate bts/btr/etc instructions on targets where they are cheap or
 395 when codesize is important.  e.g., for:
 396
 397 void setbit(int *target, int bit) {
 398     *target |= (1 << bit);
 399 }
 400 void clearbit(int *target, int bit) {
 401     *target &= ~(1 << bit);
 402 }
 403
 404 //===---------------------------------------------------------------------===//
 405
 406 Easy: Global addresses are not always allowed as immediates.  For this:
 407
 408 int dst = 0; int *ptr = 0;
 409 void foo() { ptr = &dst; }
 410
 411 we get this:
 412
 413 _foo:
 414         movl $_dst, %eax
 415         movl %eax, _ptr
 416         ret
 417
 418 When: "movl $_dst, _ptr" is sufficient.
 419
 420 //===---------------------------------------------------------------------===//
 421
 422 Use fisttp to do FP to integer conversion whenever it is available.
 423
 424 //===---------------------------------------------------------------------===//
 425
 426 Instead of the following for memset char*, 1, 10:
 427
 428         movl $16843009, 4(%edx)
 429         movl $16843009, (%edx)
 430         movw $257, 8(%edx)
 431
 432 It might be better to generate
 433
 434         movl $16843009, %eax
 435         movl %eax, 4(%edx)
 436         movl %eax, (%edx)
 437         movw al, 8(%edx)
 438
 439 when we can spare a register. It reduces code size.