lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 //===---------------------------------------------------------------------===//
   6
   7 Expand libm rounding functions inline:  Significant speedups possible.
   8 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
   9
  10 //===---------------------------------------------------------------------===//
  11
  12 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  13 other fast SSE modes.
  14
  15 //===---------------------------------------------------------------------===//
  16
  17 Think about doing i64 math in SSE regs.
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 This testcase should have no SSE instructions in it, and only one load from
  22 a constant pool:
  23
  24 double %test3(bool %B) {
  25         %C = select bool %B, double 123.412, double 523.01123123
  26         ret double %C
  27 }
  28
  29 Currently, the select is being lowered, which prevents the dag combiner from
  30 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  31
  32 The pattern isel got this one right.
  33
  34 //===---------------------------------------------------------------------===//
  35
  36 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
  37 like this:
  38
  39   X += y
  40
  41 and the register allocator decides to spill X, it is cheaper to emit this as:
  42
  43 Y += [xslot]
  44 store Y -> [xslot]
  45
  46 than as:
  47
  48 tmp = [xslot]
  49 tmp += y
  50 store tmp -> [xslot]
  51
  52 ..and this uses one fewer register (so this should be done at load folding
  53 time, not at spiller time).  *Note* however that this can only be done
  54 if Y is dead.  Here's a testcase:
  55
  56 %.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
  57 implementation   ; Functions:
  58 declare void %printf(int, ...)
  59 void %main() {
  60 build_tree.exit:
  61         br label %no_exit.i7
  62 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
  63         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
  64         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
  65         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
  66         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
  67         br bool false, label %Compute_Tree.exit23, label %no_exit.i7
  68 Compute_Tree.exit23:            ; preds = %no_exit.i7
  69         tail call void (int, ...)* %printf( int 0 )
  70         store double %tmp.34.i18, double* null
  71         ret void
  72 }
  73
  74 We currently emit:
  75
  76 .BBmain_1:
  77         xorpd %XMM1, %XMM1
  78         addsd %XMM0, %XMM1
  79 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
  80 ***     addsd %XMM2, %XMM1
  81 ***     movsd QWORD PTR [%ESP + 8], %XMM2
  82         jmp .BBmain_1   # no_exit.i7
  83
  84 This is a bugpoint reduced testcase, which is why the testcase doesn't make
  85 much sense (e.g. it's an infinite loop). :)
  86
  87 //===---------------------------------------------------------------------===//
  88
  89 SSE should implement 'select_cc' using 'emulated conditional moves' that use
  90 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  91
  92 double %X(double %Y, double %Z, double %A, double %B) {
  93         %C = setlt double %A, %B
  94         %z = add double %Z, 0.0    ;; select operand is not a load
  95         %D = select bool %C, double %Y, double %z
  96         ret double %D
  97 }
  98
  99 We currently emit:
 100
 101 _X:
 102         subl $12, %esp
 103         xorpd %xmm0, %xmm0
 104         addsd 24(%esp), %xmm0
 105         movsd 32(%esp), %xmm1
 106         movsd 16(%esp), %xmm2
 107         ucomisd 40(%esp), %xmm1
 108         jb LBB_X_2
 109 LBB_X_1:
 110         movsd %xmm0, %xmm2
 111 LBB_X_2:
 112         movsd %xmm2, (%esp)
 113         fldl (%esp)
 114         addl $12, %esp
 115         ret
 116
 117 //===---------------------------------------------------------------------===//
 118
 119 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
 120 registers. The choice may depend on subtarget information. We should do some
 121 more experiments on different x86 machines.
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 Currently the x86 codegen isn't very good at mixing SSE and FPStack
 126 code:
 127
 128 unsigned int foo(double x) { return x; }
 129
 130 foo:
 131         subl $20, %esp
 132         movsd 24(%esp), %xmm0
 133         movsd %xmm0, 8(%esp)
 134         fldl 8(%esp)
 135         fisttpll (%esp)
 136         movl (%esp), %eax
 137         addl $20, %esp
 138         ret
 139
 140 This will be solved when we go to a dynamic programming based isel.
 141
 142 //===---------------------------------------------------------------------===//
 143
 144 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 145 feasible.
 146
 147 //===---------------------------------------------------------------------===//
 148
 149 Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
 150 the reg-reg copy in this example:
 151
 152 float foo(int *x, float *y, unsigned c) {
 153   float res = 0.0;
 154   unsigned i;
 155   for (i = 0; i < c; i++) {
 156     float xx = (float)x[i];
 157     xx = xx * y[i];
 158     xx += res;
 159     res = xx;
 160   }
 161   return res;
 162 }
 163
 164 LBB_foo_3:      # no_exit
 165         cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
 166         mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
 167         addss %XMM0, %XMM1
 168         inc %ESI
 169         cmp %ESI, %ECX
 170 ****    movaps %XMM1, %XMM0
 171         jb LBB_foo_3    # no_exit
 172
 173 //===---------------------------------------------------------------------===//
 174
 175 Codegen:
 176   if (copysign(1.0, x) == copysign(1.0, y))
 177 into:
 178   if (x^y & mask)
 179 when using SSE.
 180
 181 //===---------------------------------------------------------------------===//
 182
 183 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 184 of a v4sf value.
 185
 186 //===---------------------------------------------------------------------===//
 187
 188 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 189 Perhaps use pxor / xorp* to clear an XMM register first?
 190
 191 //===---------------------------------------------------------------------===//
 192
 193 How to decide when to use the "floating point version" of logical ops? Here are
 194 some code fragments:
 195
 196         movaps LCPI5_5, %xmm2
 197         divps %xmm1, %xmm2
 198         mulps %xmm2, %xmm3
 199         mulps 8656(%ecx), %xmm3
 200         addps 8672(%ecx), %xmm3
 201         andps LCPI5_6, %xmm2
 202         andps LCPI5_1, %xmm3
 203         por %xmm2, %xmm3
 204         movdqa %xmm3, (%edi)
 205
 206         movaps LCPI5_5, %xmm1
 207         divps %xmm0, %xmm1
 208         mulps %xmm1, %xmm3
 209         mulps 8656(%ecx), %xmm3
 210         addps 8672(%ecx), %xmm3
 211         andps LCPI5_6, %xmm1
 212         andps LCPI5_1, %xmm3
 213         orps %xmm1, %xmm3
 214         movaps %xmm3, 112(%esp)
 215         movaps %xmm3, (%ebx)
 216
 217 Due to some minor source change, the latter case ended up using orps and movaps
 218 instead of por and movdqa. Does it matter?
 219
 220 //===---------------------------------------------------------------------===//
 221
 222 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 223 to choose between movaps, movapd, and movdqa based on types of source and
 224 destination?
 225
 226 How about andps, andpd, and pand? Do we really care about the type of the packed
 227 elements? If not, why not always use the "ps" variants which are likely to be
 228 shorter?
 229
 230 //===---------------------------------------------------------------------===//
 231
 232 External test Nurbs exposed some problems. Look for
 233 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 234 emits:
 235
 236         movaps    (%edx), %xmm2                                 #59.21
 237         movaps    (%edx), %xmm5                                 #60.21
 238         movaps    (%edx), %xmm4                                 #61.21
 239         movaps    (%edx), %xmm3                                 #62.21
 240         movl      40(%ecx), %ebp                                #69.49
 241         shufps    $0, %xmm2, %xmm5                              #60.21
 242         movl      100(%esp), %ebx                               #69.20
 243         movl      (%ebx), %edi                                  #69.20
 244         imull     %ebp, %edi                                    #69.49
 245         addl      (%eax), %edi                                  #70.33
 246         shufps    $85, %xmm2, %xmm4                             #61.21
 247         shufps    $170, %xmm2, %xmm3                            #62.21
 248         shufps    $255, %xmm2, %xmm2                            #63.21
 249         lea       (%ebp,%ebp,2), %ebx                           #69.49
 250         negl      %ebx                                          #69.49
 251         lea       -3(%edi,%ebx), %ebx                           #70.33
 252         shll      $4, %ebx                                      #68.37
 253         addl      32(%ecx), %ebx                                #68.37
 254         testb     $15, %bl                                      #91.13
 255         jne       L_B1.24       # Prob 5%                       #91.13
 256
 257 This is the llvm code after instruction scheduling:
 258
 259 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 260         %reg1078 = MOV32ri -3
 261         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 262         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 263         %reg1080 = IMUL32rr %reg1079, %reg1037
 264         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 265         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 266         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 267         %reg1082 = SHL32ri %reg1038, 4
 268         %reg1039 = ADD32rr %reg1036, %reg1082
 269         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 270         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 271         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 272         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 273         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 274         %reg1040 = MOV32rr %reg1039
 275         %reg1084 = AND32ri8 %reg1039, 15
 276         CMP32ri8 %reg1084, 0
 277         JE mbb<cond_next204,0xa914d30>
 278
 279 Still ok. After register allocation:
 280
 281 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 282         %EAX = MOV32ri -3
 283         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 284         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 285         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 286         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 287         IMUL32rr %EAX<def&use>, %EDX
 288         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 289         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 290         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 291         %EAX = LEA32r %ESI, 1, %EAX, -3
 292         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 293         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 294         %EDI = MOV32rr %EAX
 295         SHL32ri %EDI<def&use>, 4
 296         ADD32rr %EDI<def&use>, %ESI
 297         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 298         %XMM1 = MOVAPSrr %XMM0
 299         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 300         %XMM2 = MOVAPSrr %XMM0
 301         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 302         %XMM3 = MOVAPSrr %XMM0
 303         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 304         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 305         %EBX = MOV32rr %EDI
 306         AND32ri8 %EBX<def&use>, 15
 307         CMP32ri8 %EBX, 0
 308         JE mbb<cond_next204,0xa914d30>
 309
 310 This looks really bad. The problem is that shufps is a destructive opcode. Since
 311 the same register appears as operand two in more than one shufps op, this
 312 resulted in a number of copies. Note icc also suffers from the same problem.
 313 Either the instruction selector should select pshufd, or the register allocator
 314 can make the two-address to three-address transformation.
 315
 316 It also exposes some other problems. See MOV32ri -3 and the spills.
 317
 318 //===---------------------------------------------------------------------===//
 319
 320 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
 321
 322 LLVM is producing bad code.
 323
 324 LBB_main_4:     # cond_true44
 325         addps %xmm1, %xmm2
 326         subps %xmm3, %xmm2
 327         movaps (%ecx), %xmm4
 328         movaps %xmm2, %xmm1
 329         addps %xmm4, %xmm1
 330         addl $16, %ecx
 331         incl %edx
 332         cmpl $262144, %edx
 333         movaps %xmm3, %xmm2
 334         movaps %xmm4, %xmm3
 335         jne LBB_main_4  # cond_true44
 336
 337 There are two problems. 1) No need for two loop induction variables. We can
 338 compare against 262144 * 16. 2) Known register coalescer issue. We should
 339 be able to eliminate one of the movaps:
 340
 341         addps %xmm2, %xmm1    <=== Commute!
 342         subps %xmm3, %xmm1
 343         movaps (%ecx), %xmm4
 344         movaps %xmm1, %xmm1   <=== Eliminate!
 345         addps %xmm4, %xmm1
 346         addl $16, %ecx
 347         incl %edx
 348         cmpl $262144, %edx
 349         movaps %xmm3, %xmm2
 350         movaps %xmm4, %xmm3
 351         jne LBB_main_4  # cond_true44
 352
 353 //===---------------------------------------------------------------------===//
 354
 355 Consider:
 356
 357 __m128 test(float a) {
 358   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 359 }
 360
 361 This compiles into:
 362
 363 movss 4(%esp), %xmm1
 364 mulss %xmm1, %xmm1
 365 xorps %xmm0, %xmm0
 366 movss %xmm1, %xmm0
 367 ret
 368
 369 Because mulss doesn't modify the top 3 elements, the top elements of
 370 xmm1 are already zero'd.  We could compile this to:
 371
 372 movss 4(%esp), %xmm0
 373 mulss %xmm0, %xmm0
 374 ret
 375
 376 //===---------------------------------------------------------------------===//
 377
 378 Here's a sick and twisted idea.  Consider code like this:
 379
 380 __m128 test(__m128 a) {
 381   float b = *(float*)&A;
 382   ...
 383   return _mm_set_ps(0.0, 0.0, 0.0, b);
 384 }
 385
 386 This might compile to this code:
 387
 388 movaps c(%esp), %xmm1
 389 xorps %xmm0, %xmm0
 390 movss %xmm1, %xmm0
 391 ret
 392
 393 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 394 this code:
 395
 396 movaps c(%esp), %xmm1
 397 movaps %xmm1, c2(%esp)
 398 ...
 399
 400 xorps %xmm0, %xmm0
 401 movaps c2(%esp), %xmm1
 402 movss %xmm1, %xmm0
 403 ret
 404
 405 However, since the reload is only used by these instructions, we could
 406 "fold" it into the uses, producing something like this:
 407
 408 movaps c(%esp), %xmm1
 409 movaps %xmm1, c2(%esp)
 410 ...
 411
 412 movss c2(%esp), %xmm0
 413 ret
 414
 415 ... saving two instructions.
 416
 417 The basic idea is that a reload from a spill slot can, if only one 4-byte
 418 chunk is used, bring in 3 zeros and the one element instead of 4 elements.
 419 This can be used to simplify a variety of shuffle operations, where the
 420 elements are fixed zeros.
 421
 422 //===---------------------------------------------------------------------===//
 423
 424 For this:
 425
 426 #include <emmintrin.h>
 427 void test(__m128d *r, __m128d *A, double B) {
 428   *r = _mm_loadl_pd(*A, &B);
 429 }
 430
 431 We generate:
 432
 433         subl $12, %esp
 434         movsd 24(%esp), %xmm0
 435         movsd %xmm0, (%esp)
 436         movl 20(%esp), %eax
 437         movapd (%eax), %xmm0
 438         movlpd (%esp), %xmm0
 439         movl 16(%esp), %eax
 440         movapd %xmm0, (%eax)
 441         addl $12, %esp
 442         ret
 443
 444 icc generates:
 445
 446         movl      4(%esp), %edx                                 #3.6
 447         movl      8(%esp), %eax                                 #3.6
 448         movapd    (%eax), %xmm0                                 #4.22
 449         movlpd    12(%esp), %xmm0                               #4.8
 450         movapd    %xmm0, (%edx)                                 #4.3
 451         ret                                                     #5.1
 452
 453 So icc is smart enough to know that B is in memory so it doesn't load it and
 454 store it back to stack.
 455
 456 //===---------------------------------------------------------------------===//
 457
 458 __m128d test1( __m128d A, __m128d B) {
 459   return _mm_shuffle_pd(A, B, 0x3);
 460 }
 461
 462 compiles to
 463
 464 shufpd $3, %xmm1, %xmm0
 465
 466 Perhaps it's better to use unpckhpd instead?
 467
 468 unpckhpd %xmm1, %xmm0
 469
 470 Don't know if unpckhpd is faster. But it is shorter.
 471
 472 //===---------------------------------------------------------------------===//
 473
 474 This code generates ugly code, probably due to costs being off or something:
 475
 476 void %test(float* %P, <4 x float>* %P2 ) {
 477         %xFloat0.688 = load float* %P
 478         %loadVector37.712 = load <4 x float>* %P2
 479         %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
 480         store <4 x float> %inFloat3.713, <4 x float>* %P2
 481         ret void
 482 }
 483
 484 Generates:
 485
 486 _test:
 487         pxor %xmm0, %xmm0
 488         movd %xmm0, %eax        ;; EAX = 0!
 489         movl 8(%esp), %ecx
 490         movaps (%ecx), %xmm0
 491         pinsrw $6, %eax, %xmm0
 492         shrl $16, %eax          ;; EAX = 0 again!
 493         pinsrw $7, %eax, %xmm0
 494         movaps %xmm0, (%ecx)
 495         ret
 496
 497 It would be better to generate:
 498
 499 _test:
 500         movl 8(%esp), %ecx
 501         movaps (%ecx), %xmm0
 502         xor %eax, %eax
 503         pinsrw $6, %eax, %xmm0
 504         pinsrw $7, %eax, %xmm0
 505         movaps %xmm0, (%ecx)
 506         ret
 507
 508 or use pxor (to make a zero vector) and shuffle (to insert it).
 509
 510 //===---------------------------------------------------------------------===//
 511
 512 Some useful information in the Apple Altivec / SSE Migration Guide:
 513
 514 http://developer.apple.com/documentation/Performance/Conceptual/
 515 Accelerate_sse_migration/index.html
 516
 517 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 518
 519 //===---------------------------------------------------------------------===//
 520
 521 Add hooks to commute some CMPP operations.
 522
 523 //===---------------------------------------------------------------------===//
 524
 525 Apply the same transformation that merged four floats into a single 128-bit load
 526 to loads from constant pool.
 527
 528 //===---------------------------------------------------------------------===//
 529
 530 Floating point max / min are commutable when -enable-unsafe-fp-math is
 531 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
 532 nodes which are selected to max / min instructions that are marked commutable.
 533
 534 //===---------------------------------------------------------------------===//
 535
 536 We should compile this:
 537 #include <xmmintrin.h>
 538 typedef union {
 539   int i[4];
 540   float f[4];
 541   __m128 v;
 542 } vector4_t;
 543 void swizzle (const void *a, vector4_t * b, vector4_t * c) {
 544   b->v = _mm_loadl_pi (b->v, (__m64 *) a);
 545   c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
 546 }
 547
 548 to:
 549
 550 _swizzle:
 551         movl    4(%esp), %eax
 552         movl    8(%esp), %edx
 553         movl    12(%esp), %ecx
 554         movlps  (%eax), %xmm0
 555         movlps  %xmm0, (%edx)
 556         movlps  8(%eax), %xmm0
 557         movlps  %xmm0, (%ecx)
 558         ret
 559
 560 not:
 561
 562 swizzle:
 563         movl 8(%esp), %eax
 564         movaps (%eax), %xmm0
 565         movl 4(%esp), %ecx
 566         movlps (%ecx), %xmm0
 567         movaps %xmm0, (%eax)
 568         movl 12(%esp), %eax
 569         movaps (%eax), %xmm0
 570         movlps 8(%ecx), %xmm0
 571         movaps %xmm0, (%eax)
 572         ret
 573
 574