lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 //===---------------------------------------------------------------------===//
   6
   7 SSE Variable shift can be custom lowered to something like this, which uses a
   8 small table + unaligned load + shuffle instead of going through memory.
   9
  10 __m128i_shift_right:
  11         .byte     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
  12         .byte    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  13
  14 ...
  15 __m128i shift_right(__m128i value, unsigned long offset) {
  16   return _mm_shuffle_epi8(value,
  17                _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset)));
  18 }
  19
  20 //===---------------------------------------------------------------------===//
  21
  22 Expand libm rounding functions inline:  Significant speedups possible.
  23 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  24
  25 //===---------------------------------------------------------------------===//
  26
  27 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  28 other fast SSE modes.
  29
  30 //===---------------------------------------------------------------------===//
  31
  32 Think about doing i64 math in SSE regs on x86-32.
  33
  34 //===---------------------------------------------------------------------===//
  35
  36 This testcase should have no SSE instructions in it, and only one load from
  37 a constant pool:
  38
  39 double %test3(bool %B) {
  40         %C = select bool %B, double 123.412, double 523.01123123
  41         ret double %C
  42 }
  43
  44 Currently, the select is being lowered, which prevents the dag combiner from
  45 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  46
  47 The pattern isel got this one right.
  48
  49 //===---------------------------------------------------------------------===//
  50
  51 SSE should implement 'select_cc' using 'emulated conditional moves' that use
  52 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  53
  54 double %X(double %Y, double %Z, double %A, double %B) {
  55         %C = setlt double %A, %B
  56         %z = fadd double %Z, 0.0    ;; select operand is not a load
  57         %D = select bool %C, double %Y, double %z
  58         ret double %D
  59 }
  60
  61 We currently emit:
  62
  63 _X:
  64         subl $12, %esp
  65         xorpd %xmm0, %xmm0
  66         addsd 24(%esp), %xmm0
  67         movsd 32(%esp), %xmm1
  68         movsd 16(%esp), %xmm2
  69         ucomisd 40(%esp), %xmm1
  70         jb LBB_X_2
  71 LBB_X_1:
  72         movsd %xmm0, %xmm2
  73 LBB_X_2:
  74         movsd %xmm2, (%esp)
  75         fldl (%esp)
  76         addl $12, %esp
  77         ret
  78
  79 //===---------------------------------------------------------------------===//
  80
  81 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
  82 feasible.
  83
  84 //===---------------------------------------------------------------------===//
  85
  86 Codegen:
  87   if (copysign(1.0, x) == copysign(1.0, y))
  88 into:
  89   if (x^y & mask)
  90 when using SSE.
  91
  92 //===---------------------------------------------------------------------===//
  93
  94 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
  95 of a v4sf value.
  96
  97 //===---------------------------------------------------------------------===//
  98
  99 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 100 Perhaps use pxor / xorp* to clear a XMM register first?
 101
 102 //===---------------------------------------------------------------------===//
 103
 104 External test Nurbs exposed some problems. Look for
 105 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 106 emits:
 107
 108         movaps    (%edx), %xmm2                                 #59.21
 109         movaps    (%edx), %xmm5                                 #60.21
 110         movaps    (%edx), %xmm4                                 #61.21
 111         movaps    (%edx), %xmm3                                 #62.21
 112         movl      40(%ecx), %ebp                                #69.49
 113         shufps    $0, %xmm2, %xmm5                              #60.21
 114         movl      100(%esp), %ebx                               #69.20
 115         movl      (%ebx), %edi                                  #69.20
 116         imull     %ebp, %edi                                    #69.49
 117         addl      (%eax), %edi                                  #70.33
 118         shufps    $85, %xmm2, %xmm4                             #61.21
 119         shufps    $170, %xmm2, %xmm3                            #62.21
 120         shufps    $255, %xmm2, %xmm2                            #63.21
 121         lea       (%ebp,%ebp,2), %ebx                           #69.49
 122         negl      %ebx                                          #69.49
 123         lea       -3(%edi,%ebx), %ebx                           #70.33
 124         shll      $4, %ebx                                      #68.37
 125         addl      32(%ecx), %ebx                                #68.37
 126         testb     $15, %bl                                      #91.13
 127         jne       L_B1.24       # Prob 5%                       #91.13
 128
 129 This is the llvm code after instruction scheduling:
 130
 131 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 132         %reg1078 = MOV32ri -3
 133         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 134         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 135         %reg1080 = IMUL32rr %reg1079, %reg1037
 136         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 137         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 138         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 139         %reg1082 = SHL32ri %reg1038, 4
 140         %reg1039 = ADD32rr %reg1036, %reg1082
 141         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 142         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 143         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 144         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 145         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 146         %reg1040 = MOV32rr %reg1039
 147         %reg1084 = AND32ri8 %reg1039, 15
 148         CMP32ri8 %reg1084, 0
 149         JE mbb<cond_next204,0xa914d30>
 150
 151 Still ok. After register allocation:
 152
 153 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 154         %EAX = MOV32ri -3
 155         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 156         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 157         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 158         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 159         IMUL32rr %EAX<def&use>, %EDX
 160         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 161         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 162         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 163         %EAX = LEA32r %ESI, 1, %EAX, -3
 164         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 165         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 166         %EDI = MOV32rr %EAX
 167         SHL32ri %EDI<def&use>, 4
 168         ADD32rr %EDI<def&use>, %ESI
 169         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 170         %XMM1 = MOVAPSrr %XMM0
 171         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 172         %XMM2 = MOVAPSrr %XMM0
 173         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 174         %XMM3 = MOVAPSrr %XMM0
 175         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 176         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 177         %EBX = MOV32rr %EDI
 178         AND32ri8 %EBX<def&use>, 15
 179         CMP32ri8 %EBX, 0
 180         JE mbb<cond_next204,0xa914d30>
 181
 182 This looks really bad. The problem is shufps is a destructive opcode. Since it
 183 appears as operand two in more than one shufps ops. It resulted in a number of
 184 copies. Note icc also suffers from the same problem. Either the instruction
 185 selector should select pshufd or The register allocator can made the two-address
 186 to three-address transformation.
 187
 188 It also exposes some other problems. See MOV32ri -3 and the spills.
 189
 190 //===---------------------------------------------------------------------===//
 191
 192 Consider:
 193
 194 __m128 test(float a) {
 195   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 196 }
 197
 198 This compiles into:
 199
 200 movss 4(%esp), %xmm1
 201 mulss %xmm1, %xmm1
 202 xorps %xmm0, %xmm0
 203 movss %xmm1, %xmm0
 204 ret
 205
 206 Because mulss doesn't modify the top 3 elements, the top elements of
 207 xmm1 are already zero'd.  We could compile this to:
 208
 209 movss 4(%esp), %xmm0
 210 mulss %xmm0, %xmm0
 211 ret
 212
 213 //===---------------------------------------------------------------------===//
 214
 215 Here's a sick and twisted idea.  Consider code like this:
 216
 217 __m128 test(__m128 a) {
 218   float b = *(float*)&A;
 219   ...
 220   return _mm_set_ps(0.0, 0.0, 0.0, b);
 221 }
 222
 223 This might compile to this code:
 224
 225 movaps c(%esp), %xmm1
 226 xorps %xmm0, %xmm0
 227 movss %xmm1, %xmm0
 228 ret
 229
 230 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 231 this code:
 232
 233 movaps c(%esp), %xmm1
 234 movaps %xmm1, c2(%esp)
 235 ...
 236
 237 xorps %xmm0, %xmm0
 238 movaps c2(%esp), %xmm1
 239 movss %xmm1, %xmm0
 240 ret
 241
 242 However, since the reload is only used by these instructions, we could
 243 "fold" it into the uses, producing something like this:
 244
 245 movaps c(%esp), %xmm1
 246 movaps %xmm1, c2(%esp)
 247 ...
 248
 249 movss c2(%esp), %xmm0
 250 ret
 251
 252 ... saving two instructions.
 253
 254 The basic idea is that a reload from a spill slot, can, if only one 4-byte
 255 chunk is used, bring in 3 zeros the one element instead of 4 elements.
 256 This can be used to simplify a variety of shuffle operations, where the
 257 elements are fixed zeros.
 258
 259 //===---------------------------------------------------------------------===//
 260
 261 This code generates ugly code, probably due to costs being off or something:
 262
 263 define void @test(float* %P, <4 x float>* %P2 ) {
 264         %xFloat0.688 = load float* %P
 265         %tmp = load <4 x float>* %P2
 266         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
 267         store <4 x float> %inFloat3.713, <4 x float>* %P2
 268         ret void
 269 }
 270
 271 Generates:
 272
 273 _test:
 274         movl    8(%esp), %eax
 275         movaps  (%eax), %xmm0
 276         pxor    %xmm1, %xmm1
 277         movaps  %xmm0, %xmm2
 278         shufps  $50, %xmm1, %xmm2
 279         shufps  $132, %xmm2, %xmm0
 280         movaps  %xmm0, (%eax)
 281         ret
 282
 283 Would it be better to generate:
 284
 285 _test:
 286         movl 8(%esp), %ecx
 287         movaps (%ecx), %xmm0
 288         xor %eax, %eax
 289         pinsrw $6, %eax, %xmm0
 290         pinsrw $7, %eax, %xmm0
 291         movaps %xmm0, (%ecx)
 292         ret
 293
 294 ?
 295
 296 //===---------------------------------------------------------------------===//
 297
 298 Some useful information in the Apple Altivec / SSE Migration Guide:
 299
 300 http://developer.apple.com/documentation/Performance/Conceptual/
 301 Accelerate_sse_migration/index.html
 302
 303 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 304
 305 //===---------------------------------------------------------------------===//
 306
 307 Add hooks to commute some CMPP operations.
 308
 309 //===---------------------------------------------------------------------===//
 310
 311 Apply the same transformation that merged four float into a single 128-bit load
 312 to loads from constant pool.
 313
 314 //===---------------------------------------------------------------------===//
 315
 316 Floating point max / min are commutable when -enable-unsafe-fp-path is
 317 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
 318 nodes which are selected to max / min instructions that are marked commutable.
 319
 320 //===---------------------------------------------------------------------===//
 321
 322 We should materialize vector constants like "all ones" and "signbit" with
 323 code like:
 324
 325      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 326
 327 and:
 328      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 329      psrlq   xmm1, 31     ; xmm1 = all 100000000000...
 330
 331 instead of using a load from the constant pool.  The later is important for
 332 ABS/NEG/copysign etc.
 333
 334 //===---------------------------------------------------------------------===//
 335
 336 These functions:
 337
 338 #include <xmmintrin.h>
 339 __m128i a;
 340 void x(unsigned short n) {
 341   a = _mm_slli_epi32 (a, n);
 342 }
 343 void y(unsigned n) {
 344   a = _mm_slli_epi32 (a, n);
 345 }
 346
 347 compile to ( -O3 -static -fomit-frame-pointer):
 348 _x:
 349         movzwl  4(%esp), %eax
 350         movd    %eax, %xmm0
 351         movaps  _a, %xmm1
 352         pslld   %xmm0, %xmm1
 353         movaps  %xmm1, _a
 354         ret
 355 _y:
 356         movd    4(%esp), %xmm0
 357         movaps  _a, %xmm1
 358         pslld   %xmm0, %xmm1
 359         movaps  %xmm1, _a
 360         ret
 361
 362 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
 363 like movd would be sufficient in both cases as the value is already zero
 364 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
 365 save, as a really-signed value would be undefined for pslld.
 366
 367
 368 //===---------------------------------------------------------------------===//
 369
 370 #include <math.h>
 371 int t1(double d) { return signbit(d); }
 372
 373 This currently compiles to:
 374         subl    $12, %esp
 375         movsd   16(%esp), %xmm0
 376         movsd   %xmm0, (%esp)
 377         movl    4(%esp), %eax
 378         shrl    $31, %eax
 379         addl    $12, %esp
 380         ret
 381
 382 We should use movmskp{s|d} instead.
 383
 384 //===---------------------------------------------------------------------===//
 385
 386 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
 387 (aligned) vector load.  This functionality has a couple of problems.
 388
 389 1. The code to infer alignment from loads of globals is in the X86 backend,
 390    not the dag combiner.  This is because dagcombine2 needs to be able to see
 391    through the X86ISD::Wrapper node, which DAGCombine can't really do.
 392 2. The code for turning 4 x load into a single vector load is target
 393    independent and should be moved to the dag combiner.
 394 3. The code for turning 4 x load into a vector load can only handle a direct
 395    load from a global or a direct load from the stack.  It should be generalized
 396    to handle any load from P, P+4, P+8, P+12, where P can be anything.
 397 4. The alignment inference code cannot handle loads from globals in non-static
 398    mode because it doesn't look through the extra dyld stub load.  If you try
 399    vec_align.ll without -relocation-model=static, you'll see what I mean.
 400
 401 //===---------------------------------------------------------------------===//
 402
 403 We should lower store(fneg(load p), q) into an integer load+xor+store, which
 404 eliminates a constant pool load.  For example, consider:
 405
 406 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
 407 entry:
 408  %tmp6 = fsub float -0.000000e+00, %z.1         ; <float> [#uses=1]
 409  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
 410  ret i64 %tmp20
 411 }
 412 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
 413
 414 This currently compiles to:
 415
 416 LCPI1_0:                                        #  <4 x float>
 417         .long   2147483648      # float -0
 418         .long   2147483648      # float -0
 419         .long   2147483648      # float -0
 420         .long   2147483648      # float -0
 421 _ccosf:
 422         subl    $12, %esp
 423         movss   16(%esp), %xmm0
 424         movss   %xmm0, 4(%esp)
 425         movss   20(%esp), %xmm0
 426         xorps   LCPI1_0, %xmm0
 427         movss   %xmm0, (%esp)
 428         call    L_ccoshf$stub
 429         addl    $12, %esp
 430         ret
 431
 432 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
 433 this code computes the pic base and does two loads to do the constant pool
 434 load, so the improvement is much bigger.
 435
 436 The tricky part about this xform is that the argument load/store isn't exposed
 437 until post-legalize, and at that point, the fneg has been custom expanded into
 438 an X86 fxor.  This means that we need to handle this case in the x86 backend
 439 instead of in target independent code.
 440
 441 //===---------------------------------------------------------------------===//
 442
 443 Non-SSE4 insert into 16 x i8 is atrociously bad.
 444
 445 //===---------------------------------------------------------------------===//
 446
 447 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
 448 is memory.
 449
 450 //===---------------------------------------------------------------------===//
 451
 452 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
 453 sitting between the truncate and the extract.
 454
 455 //===---------------------------------------------------------------------===//
 456
 457 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
 458 any number of 0.0 simultaneously.  Currently we only use it for simple
 459 insertions.
 460
 461 See comments in LowerINSERT_VECTOR_ELT_SSE4.
 462
 463 //===---------------------------------------------------------------------===//
 464
 465 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
 466 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
 467 legal, it'll just take a few extra patterns written in the .td file.
 468
 469 Note: this is not a code quality issue; the custom lowered code happens to be
 470 right, but we shouldn't have to custom lower anything.  This is probably related
 471 to <2 x i64> ops being so bad.
 472
 473 //===---------------------------------------------------------------------===//
 474
 475 'select' on vectors and scalars could be a whole lot better.  We currently
 476 lower them to conditional branches.  On x86-64 for example, we compile this:
 477
 478 double test(double a, double b, double c, double d) { return a<b ? c : d; }
 479
 480 to:
 481
 482 _test:
 483         ucomisd %xmm0, %xmm1
 484         ja      LBB1_2  # entry
 485 LBB1_1: # entry
 486         movapd  %xmm3, %xmm2
 487 LBB1_2: # entry
 488         movapd  %xmm2, %xmm0
 489         ret
 490
 491 instead of:
 492
 493 _test:
 494         cmpltsd %xmm1, %xmm0
 495         andpd   %xmm0, %xmm2
 496         andnpd  %xmm3, %xmm0
 497         orpd    %xmm2, %xmm0
 498         ret
 499
 500 For unpredictable branches, the later is much more efficient.  This should
 501 just be a matter of having scalar sse map to SELECT_CC and custom expanding
 502 or iseling it.
 503
 504 //===---------------------------------------------------------------------===//
 505
 506 LLVM currently generates stack realignment code, when it is not necessary
 507 needed. The problem is that we need to know about stack alignment too early,
 508 before RA runs.
 509
 510 At that point we don't know, whether there will be vector spill, or not.
 511 Stack realignment logic is overly conservative here, but otherwise we can
 512 produce unaligned loads/stores.
 513
 514 Fixing this will require some huge RA changes.
 515
 516 Testcase:
 517 #include <emmintrin.h>
 518
 519 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
 520
 521 static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
 522 - 22725, - 12873};;
 523
 524 vSInt16 madd(vSInt16 b)
 525 {
 526     return _mm_madd_epi16(a, b);
 527 }
 528
 529 Generated code (x86-32, linux):
 530 madd:
 531         pushl   %ebp
 532         movl    %esp, %ebp
 533         andl    $-16, %esp
 534         movaps  .LCPI1_0, %xmm1
 535         pmaddwd %xmm1, %xmm0
 536         movl    %ebp, %esp
 537         popl    %ebp
 538         ret
 539
 540 //===---------------------------------------------------------------------===//
 541
 542 Consider:
 543 #include <emmintrin.h>
 544 __m128 foo2 (float x) {
 545  return _mm_set_ps (0, 0, x, 0);
 546 }
 547
 548 In x86-32 mode, we generate this spiffy code:
 549
 550 _foo2:
 551         movss   4(%esp), %xmm0
 552         pshufd  $81, %xmm0, %xmm0
 553         ret
 554
 555 in x86-64 mode, we generate this code, which could be better:
 556
 557 _foo2:
 558         xorps   %xmm1, %xmm1
 559         movss   %xmm0, %xmm1
 560         pshufd  $81, %xmm1, %xmm0
 561         ret
 562
 563 In sse4 mode, we could use insertps to make both better.
 564
 565 Here's another testcase that could use insertps [mem]:
 566
 567 #include <xmmintrin.h>
 568 extern float x2, x3;
 569 __m128 foo1 (float x1, float x4) {
 570  return _mm_set_ps (x2, x1, x3, x4);
 571 }
 572
 573 gcc mainline compiles it to:
 574
 575 foo1:
 576        insertps        $0x10, x2(%rip), %xmm0
 577        insertps        $0x10, x3(%rip), %xmm1
 578        movaps  %xmm1, %xmm2
 579        movlhps %xmm0, %xmm2
 580        movaps  %xmm2, %xmm0
 581        ret
 582
 583 //===---------------------------------------------------------------------===//
 584
 585 We compile vector multiply-by-constant into poor code:
 586
 587 define <4 x i32> @f(<4 x i32> %i) nounwind  {
 588         %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
 589         ret <4 x i32> %A
 590 }
 591
 592 On targets without SSE4.1, this compiles into:
 593
 594 LCPI1_0:                                        ##  <4 x i32>
 595         .long   10
 596         .long   10
 597         .long   10
 598         .long   10
 599         .text
 600         .align  4,0x90
 601         .globl  _f
 602 _f:
 603         pshufd  $3, %xmm0, %xmm1
 604         movd    %xmm1, %eax
 605         imull   LCPI1_0+12, %eax
 606         movd    %eax, %xmm1
 607         pshufd  $1, %xmm0, %xmm2
 608         movd    %xmm2, %eax
 609         imull   LCPI1_0+4, %eax
 610         movd    %eax, %xmm2
 611         punpckldq       %xmm1, %xmm2
 612         movd    %xmm0, %eax
 613         imull   LCPI1_0, %eax
 614         movd    %eax, %xmm1
 615         movhlps %xmm0, %xmm0
 616         movd    %xmm0, %eax
 617         imull   LCPI1_0+8, %eax
 618         movd    %eax, %xmm0
 619         punpckldq       %xmm0, %xmm1
 620         movaps  %xmm1, %xmm0
 621         punpckldq       %xmm2, %xmm0
 622         ret
 623
 624 It would be better to synthesize integer vector multiplication by constants
 625 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
 626 simple cases such as multiplication by powers of two would be better as
 627 vector shifts than as multiplications.
 628
 629 //===---------------------------------------------------------------------===//
 630
 631 We compile this:
 632
 633 __m128i
 634 foo2 (char x)
 635 {
 636   return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
 637 }
 638
 639 into:
 640         movl    $1, %eax
 641         xorps   %xmm0, %xmm0
 642         pinsrw  $2, %eax, %xmm0
 643         movzbl  4(%esp), %eax
 644         pinsrw  $3, %eax, %xmm0
 645         movl    $256, %eax
 646         pinsrw  $7, %eax, %xmm0
 647         ret
 648
 649
 650 gcc-4.2:
 651         subl    $12, %esp
 652         movzbl  16(%esp), %eax
 653         movdqa  LC0, %xmm0
 654         pinsrw  $3, %eax, %xmm0
 655         addl    $12, %esp
 656         ret
 657         .const
 658         .align 4
 659 LC0:
 660         .word   0
 661         .word   0
 662         .word   1
 663         .word   0
 664         .word   0
 665         .word   0
 666         .word   0
 667         .word   256
 668
 669 With SSE4, it should be
 670       movdqa  .LC0(%rip), %xmm0
 671       pinsrb  $6, %edi, %xmm0
 672
 673 //===---------------------------------------------------------------------===//
 674
 675 We should transform a shuffle of two vectors of constants into a single vector
 676 of constants. Also, insertelement of a constant into a vector of constants
 677 should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
 678
 679 We compiled it to something horrible:
 680
 681         .align  4
 682 LCPI1_1:                                        ##  float
 683         .long   1065353216      ## float 1
 684         .const
 685
 686         .align  4
 687 LCPI1_0:                                        ##  <4 x float>
 688         .space  4
 689         .long   1065353216      ## float 1
 690         .space  4
 691         .long   1065353216      ## float 1
 692         .text
 693         .align  4,0x90
 694         .globl  _t
 695 _t:
 696         xorps   %xmm0, %xmm0
 697         movhps  LCPI1_0, %xmm0
 698         movss   LCPI1_1, %xmm1
 699         movaps  %xmm0, %xmm2
 700         shufps  $2, %xmm1, %xmm2
 701         shufps  $132, %xmm2, %xmm0
 702         movaps  %xmm0, 0
 703
 704 //===---------------------------------------------------------------------===//
 705 rdar://5907648
 706
 707 This function:
 708
 709 float foo(unsigned char x) {
 710   return x;
 711 }
 712
 713 compiles to (x86-32):
 714
 715 define float @foo(i8 zeroext  %x) nounwind  {
 716         %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
 717         ret float %tmp12
 718 }
 719
 720 compiles to:
 721
 722 _foo:
 723         subl    $4, %esp
 724         movzbl  8(%esp), %eax
 725         cvtsi2ss        %eax, %xmm0
 726         movss   %xmm0, (%esp)
 727         flds    (%esp)
 728         addl    $4, %esp
 729         ret
 730
 731 We should be able to use:
 732   cvtsi2ss 8($esp), %xmm0
 733 since we know the stack slot is already zext'd.
 734
 735 //===---------------------------------------------------------------------===//
 736
 737 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
 738 when code size is critical. movlps is slower than movsd on core2 but it's one
 739 byte shorter.
 740
 741 //===---------------------------------------------------------------------===//
 742
 743 We should use a dynamic programming based approach to tell when using FPStack
 744 operations is cheaper than SSE.  SciMark montecarlo contains code like this
 745 for example:
 746
 747 double MonteCarlo_num_flops(int Num_samples) {
 748     return ((double) Num_samples)* 4.0;
 749 }
 750
 751 In fpstack mode, this compiles into:
 752
 753 LCPI1_0:
 754         .long   1082130432      ## float 4.000000e+00
 755 _MonteCarlo_num_flops:
 756         subl    $4, %esp
 757         movl    8(%esp), %eax
 758         movl    %eax, (%esp)
 759         fildl   (%esp)
 760         fmuls   LCPI1_0
 761         addl    $4, %esp
 762         ret
 763
 764 in SSE mode, it compiles into significantly slower code:
 765
 766 _MonteCarlo_num_flops:
 767         subl    $12, %esp
 768         cvtsi2sd        16(%esp), %xmm0
 769         mulsd   LCPI1_0, %xmm0
 770         movsd   %xmm0, (%esp)
 771         fldl    (%esp)
 772         addl    $12, %esp
 773         ret
 774
 775 There are also other cases in scimark where using fpstack is better, it is
 776 cheaper to do fld1 than load from a constant pool for example, so
 777 "load, add 1.0, store" is better done in the fp stack, etc.
 778
 779 //===---------------------------------------------------------------------===//
 780
 781 The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
 782 "cmpsd".  For example, this code:
 783
 784 double d1(double x) { return x == x ? x : x + x; }
 785
 786 Compiles into:
 787
 788 _d1:
 789         ucomisd %xmm0, %xmm0
 790         jnp     LBB1_2
 791         addsd   %xmm0, %xmm0
 792         ret
 793 LBB1_2:
 794         ret
 795
 796 Also, the 'ret's should be shared.  This is PR6032.
 797
 798 //===---------------------------------------------------------------------===//
 799
 800 These should compile into the same code (PR6214): Perhaps instcombine should
 801 canonicalize the former into the later?
 802
 803 define float @foo(float %x) nounwind {
 804   %t = bitcast float %x to i32
 805   %s = and i32 %t, 2147483647
 806   %d = bitcast i32 %s to float
 807   ret float %d
 808 }
 809
 810 declare float @fabsf(float %n)
 811 define float @bar(float %x) nounwind {
 812   %d = call float @fabsf(float %x)
 813   ret float %d
 814 }
 815
 816 //===---------------------------------------------------------------------===//
 817
 818 This IR (from PR6194):
 819
 820 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 821 target triple = "x86_64-apple-darwin10.0.0"
 822
 823 %0 = type { double, double }
 824 %struct.float3 = type { float, float, float }
 825
 826 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
 827 entry:
 828   %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
 829   %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
 830   %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
 831   %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
 832   %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
 833   %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
 834   %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
 835   store float %tmp12, float* %tmp5
 836   ret void
 837 }
 838
 839 Compiles to:
 840
 841 _test:                                  ## @test
 842         movd    %xmm0, %rax
 843         shrq    $32, %rax
 844         movl    %eax, 4(%rdi)
 845         ret
 846
 847 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
 848 doing a shuffle from v[1] to v[0] then a float store.
 849
 850 //===---------------------------------------------------------------------===//
 851
 852 On SSE4 machines, we compile this code:
 853
 854 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
 855        <2 x float> *%P) nounwind {
 856   %Z = fadd <2 x float> %Q, %R
 857
 858   store <2 x float> %Z, <2 x float> *%P
 859   ret <2 x float> %Z
 860 }
 861
 862 into:
 863
 864 _test2:                                 ## @test2
 865 ## BB#0:
 866         insertps        $0, %xmm2, %xmm2
 867         insertps        $16, %xmm3, %xmm2
 868         insertps        $0, %xmm0, %xmm3
 869         insertps        $16, %xmm1, %xmm3
 870         addps   %xmm2, %xmm3
 871         movq    %xmm3, (%rdi)
 872         movaps  %xmm3, %xmm0
 873         pshufd  $1, %xmm3, %xmm1
 874                                         ## kill: XMM1<def> XMM1<kill>
 875         ret
 876
 877 The insertps's of $0 are pointless complex copies.
 878
 879 //===---------------------------------------------------------------------===//
 880
 881